code formatting
This commit is contained in:
commit
1138b2ac8e
|
@ -1,18 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import com.ctc.wstx.dtd.LargePrefixedNameSet;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
|
@ -29,7 +29,19 @@ public class PacePerson {
|
|||
private List<String> fullname = Lists.newArrayList();
|
||||
private final String original;
|
||||
|
||||
private static Set<String> particles = null;
|
||||
private static Set<String> particles;
|
||||
|
||||
static {
|
||||
try {
|
||||
particles = new HashSet<>(IOUtils
|
||||
.readLines(
|
||||
PacePerson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Capitalizes a string
|
||||
|
@ -37,29 +49,20 @@ public class PacePerson {
|
|||
* @param s the string to capitalize
|
||||
* @return the input string with capital letter
|
||||
*/
|
||||
public static final String capitalize(final String s) {
|
||||
public static String capitalize(final String s) {
|
||||
if (particles.contains(s)) {
|
||||
return s;
|
||||
}
|
||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a dot to a string whose length is equal to 1
|
||||
*/
|
||||
public static final String dotAbbreviations(final String s) {
|
||||
public static String dotAbbreviations(final String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = new HashSet<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
|
||||
h.add(s);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
/**
|
||||
* The constructor of the class. It fills the fields of the class based on the input fullname.
|
||||
*
|
||||
|
@ -128,10 +131,6 @@ public class PacePerson {
|
|||
}
|
||||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
||||
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (!particles.contains(part.toLowerCase())) {
|
||||
|
@ -187,17 +186,36 @@ public class PacePerson {
|
|||
}
|
||||
|
||||
public List<String> getCapitalFirstnames() {
|
||||
return Lists
|
||||
.newArrayList(
|
||||
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
|
||||
return Optional
|
||||
.ofNullable(getNameWithAbbreviations())
|
||||
.map(
|
||||
name -> name
|
||||
.stream()
|
||||
.map(PacePerson::capitalize)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>());
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
|
||||
return Optional
|
||||
.ofNullable(getSurname())
|
||||
.map(
|
||||
surname -> surname
|
||||
.stream()
|
||||
.map(PacePerson::capitalize)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>());
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
|
||||
return Optional
|
||||
.ofNullable(getName())
|
||||
.map(
|
||||
name -> name
|
||||
.stream()
|
||||
.map(PacePerson::dotAbbreviations)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>());
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.sql.Array;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.function.Function;
|
||||
|
@ -118,6 +120,17 @@ public class OafMapperUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static <T> List<T> listValues(Array values) throws SQLException {
|
||||
if (Objects.isNull(values)) {
|
||||
return null;
|
||||
}
|
||||
return Arrays
|
||||
.stream((T[]) values.getArray())
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
|
||||
return values
|
||||
.stream()
|
||||
|
@ -391,4 +404,19 @@ public class OafMapperUtils {
|
|||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setDataInfo(dataInfo);
|
||||
kv.setKey(key);
|
||||
kv.setValue(value);
|
||||
return kv;
|
||||
}
|
||||
|
||||
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
|
||||
Measure m = new Measure();
|
||||
m.setId(id);
|
||||
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
|
||||
return m;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,9 +75,14 @@ public class DHPUtils {
|
|||
|
||||
final HttpGet req = new HttpGet(url);
|
||||
|
||||
log.info("MDStoreManager request: {}", req);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String json = IOUtils.toString(response.getEntity().getContent());
|
||||
|
||||
log.info("MDStoreManager response: {}", json);
|
||||
|
||||
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
|
||||
return Arrays
|
||||
.stream(mdstores)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
van
|
||||
von
|
||||
der
|
||||
de
|
||||
dell
|
|
@ -44,105 +44,104 @@ class OafMapperUtilsTest {
|
|||
@Test
|
||||
void testDateValidation() {
|
||||
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
|
||||
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
|
||||
assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z "));
|
||||
assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 "));
|
||||
assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05"));
|
||||
|
||||
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
|
||||
assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05"));
|
||||
|
||||
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
|
||||
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
|
||||
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
|
||||
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
|
||||
assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST"));
|
||||
assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700"));
|
||||
assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000"));
|
||||
assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015"));
|
||||
assertEquals(
|
||||
"2015-07-03",
|
||||
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
|
||||
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
|
||||
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
|
||||
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
|
||||
GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)"));
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am"));
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08"));
|
||||
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970"));
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17"));
|
||||
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70"));
|
||||
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970"));
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013"));
|
||||
assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013"));
|
||||
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014"));
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71"));
|
||||
assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71"));
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05"));
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05"));
|
||||
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05"));
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51"));
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM"));
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM"));
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM"));
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM"));
|
||||
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM"));
|
||||
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51"));
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59"));
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31"));
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05"));
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05"));
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51"));
|
||||
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51"));
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59"));
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369"));
|
||||
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日"));
|
||||
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000"));
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00"));
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369"));
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123"));
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43"));
|
||||
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22"));
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC"));
|
||||
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00"));
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC"));
|
||||
assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC"));
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT"));
|
||||
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC"));
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
|
||||
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001"));
|
||||
assertEquals(
|
||||
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
|
||||
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
|
||||
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
|
||||
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
|
||||
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
|
||||
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
|
||||
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
|
||||
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
|
||||
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
|
||||
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001"));
|
||||
assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00"));
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26"));
|
||||
assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04"));
|
||||
assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014"));
|
||||
assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014"));
|
||||
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014"));
|
||||
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71"));
|
||||
assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03"));
|
||||
assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30"));
|
||||
assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601"));
|
||||
assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203"));
|
||||
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919"));
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189"));
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222"));
|
||||
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333"));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -172,6 +172,61 @@ public class PromoteActionPayloadForGraphTableJobTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldPromoteActionPayload_custom() throws Exception {
|
||||
|
||||
Class<? extends Oaf> rowClazz = Publication.class;
|
||||
Class<? extends Oaf> actionPayloadClazz = Result.class;
|
||||
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.MERGE_FROM_AND_GET;
|
||||
|
||||
// given
|
||||
Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz);
|
||||
Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz);
|
||||
Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase());
|
||||
|
||||
// when
|
||||
PromoteActionPayloadForGraphTableJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputGraphTablePath",
|
||||
inputGraphTableDir.toString(),
|
||||
"-graphTableClassName",
|
||||
rowClazz.getCanonicalName(),
|
||||
"-inputActionPayloadPath",
|
||||
inputActionPayloadDir.toString(),
|
||||
"-actionPayloadClassName",
|
||||
actionPayloadClazz.getCanonicalName(),
|
||||
"-outputGraphTablePath",
|
||||
outputGraphTableDir.toString(),
|
||||
"-mergeAndGetStrategy",
|
||||
strategy.name(),
|
||||
"--shouldGroupById",
|
||||
"true"
|
||||
});
|
||||
|
||||
// then
|
||||
assertTrue(Files.exists(outputGraphTableDir));
|
||||
|
||||
List<? extends Oaf> actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz)
|
||||
.collectAsList()
|
||||
.stream()
|
||||
.sorted(Comparator.comparingInt(Object::hashCode))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
Publication p = actualOutputRows
|
||||
.stream()
|
||||
.map(o -> (Publication) o)
|
||||
.filter(o -> "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879".equals(o.getId()))
|
||||
.findFirst()
|
||||
.get();
|
||||
|
||||
assertNotNull(p.getMeasures());
|
||||
assertTrue(p.getMeasures().size() > 0);
|
||||
|
||||
}
|
||||
|
||||
public static Stream<Arguments> promoteJobTestParams() {
|
||||
return Stream
|
||||
.of(
|
||||
|
|
|
@ -18,3 +18,4 @@
|
|||
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not 
available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
|
||||
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not 
available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
|
||||
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not 
available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
|
||||
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"1.64385446761e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"18.9590813696","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"6.00577981643e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"author":null,"resulttype":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":null}
|
|
@ -27,6 +27,8 @@ public class Constants {
|
|||
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
|
||||
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
|
||||
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
|
||||
public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
|
||||
public static final String UPDATE_KEY_USAGE_COUNTS = "count";
|
||||
|
||||
public static final String FOS_CLASS_ID = "FOS";
|
||||
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.usagestats;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Measure;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* created the Atomic Action for each type of results
|
||||
*/
|
||||
public class SparkAtomicActionUsageJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkAtomicActionUsageJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
||||
final String dbname = parser.get("usagestatsdb");
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
runWithSparkHiveSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
prepareResults(dbname, spark, workingPath);
|
||||
writeActionSet(spark, workingPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
public static void prepareResults(String db, SparkSession spark, String workingPath) {
|
||||
spark
|
||||
.sql(
|
||||
"Select result_id, downloads, views " +
|
||||
"from " + db + ".usage_stats")
|
||||
.as(Encoders.bean(UsageStatsModel.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
}
|
||||
|
||||
public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {
|
||||
readPath(spark, inputPath, UsageStatsModel.class)
|
||||
.groupByKey((MapFunction<UsageStatsModel, String>) us -> us.getResult_id(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, UsageStatsModel, Result>) (k, it) -> {
|
||||
UsageStatsModel first = it.next();
|
||||
it.forEachRemaining(us -> {
|
||||
first.setDownloads(first.getDownloads() + us.getDownloads());
|
||||
first.setViews(first.getViews() + us.getViews());
|
||||
});
|
||||
|
||||
Result res = new Result();
|
||||
res.setId("50|" + k);
|
||||
|
||||
res.setMeasures(getMeasure(first.getDownloads(), first.getViews()));
|
||||
return res;
|
||||
}, Encoders.bean(Result.class))
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
|
||||
}
|
||||
|
||||
private static List<Measure> getMeasure(Long downloads, Long views) {
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"");
|
||||
|
||||
return Arrays
|
||||
.asList(
|
||||
OafMapperUtils
|
||||
.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
|
||||
OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));
|
||||
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.usagestats;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Bean with the usage counters of a result: the fields mirror the {@code result_id}, {@code downloads} and
 * {@code views} columns selected from the usage stats table.
 */
public class UsageStatsModel implements Serializable {

	/** identifier of the result the counters refer to */
	private String result_id;

	/** number of downloads, may be null when not set */
	private Long downloads;

	/** number of views, may be null when not set */
	private Long views;

	/**
	 * @return the identifier of the result
	 */
	public String getResult_id() {
		return this.result_id;
	}

	/**
	 * @param result_id the identifier of the result
	 */
	public void setResult_id(String result_id) {
		this.result_id = result_id;
	}

	/**
	 * @return the number of downloads
	 */
	public Long getDownloads() {
		return this.downloads;
	}

	/**
	 * @param downloads the number of downloads
	 */
	public void setDownloads(Long downloads) {
		this.downloads = downloads;
	}

	/**
	 * @return the number of views
	 */
	public Long getViews() {
		return this.views;
	}

	/**
	 * @param views the number of views
	 */
	public void setViews(Long views) {
		this.views = views;
	}
}
|
|
@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory;
|
|||
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
|
||||
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||
|
@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob {
|
|||
return new OaiCollectorPlugin(clientParams);
|
||||
case rest_json2xml:
|
||||
return new RestCollectorPlugin(clientParams);
|
||||
case file:
|
||||
return new FileCollectorPlugin(fileSystem);
|
||||
case fileGZip:
|
||||
return new FileGZipCollectorPlugin(fileSystem);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
.ofNullable(api.getParams().get("other_plugin_type"))
|
||||
|
|
|
@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
|
|||
public interface CollectorPlugin {
|
||||
|
||||
enum NAME {
|
||||
oai, other, rest_json2xml;
|
||||
oai, other, rest_json2xml, file, fileGZip;
|
||||
|
||||
public enum OTHER_NAME {
|
||||
mdstore_mongodb_dump, mdstore_mongodb
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
|
||||
|
||||
public static final String SPLIT_ON_ELEMENT = "splitOnElement";
|
||||
|
||||
private final FileSystem fileSystem;
|
||||
|
||||
public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
|
||||
this.fileSystem = fileSystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||
|
||||
// get path to file
|
||||
final Path filePath = Optional
|
||||
.ofNullable(api.getBaseUrl())
|
||||
.map(Path::new)
|
||||
.orElseThrow(() -> new CollectorException("missing baseUrl"));
|
||||
|
||||
log.info("baseUrl: {}", filePath);
|
||||
|
||||
// check that path to file exists
|
||||
try {
|
||||
if (!fileSystem.exists(filePath)) {
|
||||
throw new CollectorException("path does not exist: " + filePath);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
|
||||
// get split element
|
||||
final String splitOnElement = Optional
|
||||
.ofNullable(api.getParams().get(SPLIT_ON_ELEMENT))
|
||||
.orElseThrow(
|
||||
() -> new CollectorException(String
|
||||
.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
|
||||
|
||||
log.info("splitOnElement: {}", splitOnElement);
|
||||
|
||||
final BufferedInputStream bis = getBufferedInputStream(filePath);
|
||||
|
||||
Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis);
|
||||
|
||||
return StreamSupport
|
||||
.stream(
|
||||
Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
|
||||
false);
|
||||
}
|
||||
|
||||
abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;
|
||||
|
||||
public FileSystem getFileSystem() {
|
||||
return fileSystem;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
|
||||
|
||||
public FileCollectorPlugin(FileSystem fileSystem) {
|
||||
super(fileSystem);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
|
||||
|
||||
log.info("filePath: {}", filePath);
|
||||
|
||||
try {
|
||||
FileSystem fs = super.getFileSystem();
|
||||
return new BufferedInputStream(fs.open(filePath));
|
||||
} catch (Exception e) {
|
||||
throw new CollectorException("Error reading file " + filePath, e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);
|
||||
|
||||
public FileGZipCollectorPlugin(FileSystem fileSystem) {
|
||||
super(fileSystem);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
|
||||
|
||||
log.info("filePath: {}", filePath);
|
||||
|
||||
try {
|
||||
FileSystem fs = super.getFileSystem();
|
||||
GZIPInputStream stream = new GZIPInputStream(fs.open(filePath));
|
||||
return new BufferedInputStream(stream);
|
||||
} catch (Exception e) {
|
||||
throw new CollectorException("Error reading file " + filePath, e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.XmlCleaner;
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
||||
|
|
|
@ -30,7 +30,7 @@ import org.w3c.dom.Node;
|
|||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import eu.dnetlib.dhp.collection.JsonUtils;
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
package eu.dnetlib.dhp.collection.plugin.utils;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
|
@ -0,0 +1,177 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.utils;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.Iterator;
|
||||
|
||||
import javax.xml.stream.XMLEventFactory;
|
||||
import javax.xml.stream.XMLEventReader;
|
||||
import javax.xml.stream.XMLEventWriter;
|
||||
import javax.xml.stream.XMLInputFactory;
|
||||
import javax.xml.stream.XMLOutputFactory;
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import javax.xml.stream.events.StartElement;
|
||||
import javax.xml.stream.events.XMLEvent;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class XMLIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(XMLIterator.class);
|
||||
|
||||
private ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
|
||||
|
||||
@Override
|
||||
protected XMLInputFactory initialValue() {
|
||||
return XMLInputFactory.newInstance();
|
||||
}
|
||||
};
|
||||
|
||||
private ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
|
||||
|
||||
@Override
|
||||
protected XMLOutputFactory initialValue() {
|
||||
return XMLOutputFactory.newInstance();
|
||||
}
|
||||
};
|
||||
|
||||
private ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
|
||||
|
||||
@Override
|
||||
protected XMLEventFactory initialValue() {
|
||||
return XMLEventFactory.newInstance();
|
||||
}
|
||||
};
|
||||
|
||||
public static final String UTF_8 = "UTF-8";
|
||||
|
||||
final XMLEventReader parser;
|
||||
|
||||
private XMLEvent current = null;
|
||||
|
||||
private String element;
|
||||
|
||||
private InputStream inputStream;
|
||||
|
||||
public XMLIterator(final String element, final InputStream inputStream) {
|
||||
super();
|
||||
this.element = element;
|
||||
this.inputStream = inputStream;
|
||||
this.parser = getParser();
|
||||
try {
|
||||
this.current = findElement(parser);
|
||||
} catch (XMLStreamException e) {
|
||||
log.warn("cannot init parser position. No element found: " + element);
|
||||
current = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return current != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String result = null;
|
||||
try {
|
||||
result = copy(parser);
|
||||
current = findElement(parser);
|
||||
return result;
|
||||
} catch (XMLStreamException e) {
|
||||
throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@SuppressWarnings("finally")
|
||||
private String copy(final XMLEventReader parser) throws XMLStreamException {
|
||||
final StringWriter result = new StringWriter();
|
||||
try {
|
||||
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
|
||||
final StartElement start = current.asStartElement();
|
||||
final StartElement newRecord = eventFactory
|
||||
.get()
|
||||
.createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
|
||||
|
||||
// new root record
|
||||
writer.add(newRecord);
|
||||
|
||||
// copy the rest as it is
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent event = parser.nextEvent();
|
||||
|
||||
// TODO: replace with depth tracking instead of close tag tracking.
|
||||
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
|
||||
writer.add(event);
|
||||
break;
|
||||
}
|
||||
|
||||
writer.add(event);
|
||||
}
|
||||
writer.close();
|
||||
} finally {
|
||||
return result.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next occurrence of the splitter element.
|
||||
*
|
||||
* @param parser
|
||||
* @return
|
||||
* @throws XMLStreamException
|
||||
*/
|
||||
private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {
|
||||
|
||||
/*
|
||||
* if (current != null && element.equals(current.asStartElement().getName().getLocalPart())) { return current; }
|
||||
*/
|
||||
|
||||
XMLEvent peek = parser.peek();
|
||||
if (peek != null && peek.isStartElement()) {
|
||||
String name = peek.asStartElement().getName().getLocalPart();
|
||||
if (element.equals(name)) {
|
||||
return peek;
|
||||
}
|
||||
}
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent event = parser.nextEvent();
|
||||
if (event != null && event.isStartElement()) {
|
||||
String name = event.asStartElement().getName().getLocalPart();
|
||||
if (element.equals(name)) {
|
||||
return event;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private XMLEventReader getParser() {
|
||||
try {
|
||||
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
|
||||
} catch (XMLStreamException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private Reader sanitize(final InputStream in) {
|
||||
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
|
||||
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||
charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
|
||||
return new InputStreamReader(in, charsetDecoder);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection;
|
||||
package eu.dnetlib.dhp.collection.plugin.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "hmu",
|
||||
"paramLongName": "hive_metastore_uris",
|
||||
"paramDescription": "the URI for the hive metastore",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "sdb",
|
||||
"paramLongName": "usagestatsdb",
|
||||
"paramDescription": "the name of the db to be used",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workingPath where to save the content of the usage_stats table",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="UsageStatsCounts" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>usagestatsdb</name>
|
||||
<description>the name of the db to be used</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="atomicactions"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="atomicactions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the usage stats count for results</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/usageDb</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -47,13 +47,18 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
|
||||
/** This method should skip record if json contains invalid text
|
||||
* defined in gile datacite_filter
|
||||
* defined in file datacite_filter
|
||||
*
|
||||
* @param json
|
||||
* @param record : unparsed datacite record
|
||||
* @param json : parsed record
|
||||
* @return True if the record should be skipped
|
||||
*/
|
||||
def skip_record(json: String): Boolean = {
|
||||
datacite_filter.exists(f => json.contains(f))
|
||||
def skip_record(record: String, json: org.json4s.JValue): Boolean = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher")
|
||||
.extractOrElse[String]("")
|
||||
.equalsIgnoreCase("FAIRsharing")
|
||||
|
||||
}
|
||||
|
||||
@deprecated("this method will be removed", "dhp")
|
||||
|
@ -304,12 +309,13 @@ object DataciteToOAFTransformation {
|
|||
vocabularies: VocabularyGroup,
|
||||
exportLinks: Boolean
|
||||
): List[Oaf] = {
|
||||
if (skip_record(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
if (skip_record(input, json))
|
||||
return List()
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral =
|
||||
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
|
|
|
@ -0,0 +1,256 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.usagestats;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkAtomicActionCountJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(SparkAtomicActionCountJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(SparkAtomicActionCountJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(SparkAtomicActionCountJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkAtomicActionCountJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMatch() {
|
||||
String usageScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/usagestats/usagestatsdb")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionUsageJob.writeActionSet(spark, usageScoresPath, workingDir.toString() + "/actionSet");
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Result> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(usm -> OBJECT_MAPPER.readValue(usm._2.getBytes(), AtomicAction.class))
|
||||
.map(aa -> (Result) aa.getPayload());
|
||||
|
||||
Assertions.assertEquals(9, tmp.count());
|
||||
|
||||
tmp.foreach(r -> Assertions.assertEquals(2, r.getMeasures().size()));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getDeletedbyinference()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m.getUnit().stream().forEach(u -> Assertions.assertTrue(u.getDataInfo().getInferred()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getInvisible()))));
|
||||
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"measure:usage_counts",
|
||||
u.getDataInfo().getProvenanceaction().getClassid()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"Inferred by OpenAIRE",
|
||||
u.getDataInfo().getProvenanceaction().getClassname()))));
|
||||
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"count",
|
||||
u.getKey()))));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, tmp.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6")).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"5",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"2",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"6",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import net.bytebuddy.asm.Advice;
|
||||
|
||||
public class FileCollectorPluginTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
|
||||
|
||||
private final ApiDescriptor api = new ApiDescriptor();
|
||||
|
||||
private FileCollectorPlugin plugin;
|
||||
|
||||
private static final String SPLIT_ON_ELEMENT = "repository";
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
|
||||
final String gzipFile = this
|
||||
.getClass()
|
||||
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml")
|
||||
.getFile();
|
||||
|
||||
api.setBaseUrl(gzipFile);
|
||||
|
||||
HashMap<String, String> params = new HashMap<>();
|
||||
params.put("splitOnElement", SPLIT_ON_ELEMENT);
|
||||
|
||||
api.setParams(params);
|
||||
|
||||
FileSystem fs = FileSystem.get(new Configuration());
|
||||
plugin = new FileCollectorPlugin(fs);
|
||||
}
|
||||
|
||||
@Test
|
||||
void test() throws CollectorException {
|
||||
|
||||
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
|
||||
|
||||
stream.limit(10).forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
log.info(s);
|
||||
});
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.file;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.HashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class FileGZipCollectorPluginTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
|
||||
|
||||
private final ApiDescriptor api = new ApiDescriptor();
|
||||
|
||||
private FileGZipCollectorPlugin plugin;
|
||||
|
||||
private static final String SPLIT_ON_ELEMENT = "repository";
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
|
||||
final String gzipFile = Objects
|
||||
.requireNonNull(
|
||||
this
|
||||
.getClass()
|
||||
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz"))
|
||||
.getFile();
|
||||
|
||||
api.setBaseUrl(gzipFile);
|
||||
|
||||
HashMap<String, String> params = new HashMap<>();
|
||||
params.put("splitOnElement", SPLIT_ON_ELEMENT);
|
||||
|
||||
api.setParams(params);
|
||||
|
||||
FileSystem fs = FileSystem.get(new Configuration());
|
||||
plugin = new FileGZipCollectorPlugin(fs);
|
||||
}
|
||||
|
||||
@Test
|
||||
void test() throws CollectorException {
|
||||
|
||||
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
|
||||
|
||||
stream.limit(10).forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
log.info(s);
|
||||
});
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,12 @@
|
|||
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":4}
|
||||
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":1}
|
||||
{"result_id":"doi_________::17eda2ff77407538fbe5d3d719b9d1c0","downloads":0,"views":1}
|
||||
{"result_id":"doi_________::1d4dc08605fd0a2be1105d30c63bfea1","downloads":1,"views":3}
|
||||
{"result_id":"doi_________::2e3527822854ca9816f6dfea5bff61a8","downloads":1,"views":1}
|
||||
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":2,"views":3}
|
||||
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":0,"views":3}
|
||||
{"result_id":"doi_________::33f710e6dd30cc5e67e35b371ddc33cf","downloads":0,"views":1}
|
||||
{"result_id":"doi_________::39738ebf10654732dd3a7af9f24655f8","downloads":1,"views":3}
|
||||
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":1,"views":8}
|
||||
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":0,"views":2}
|
||||
{"result_id":"doi_________::4938a71a884dd481d329657aa543b850","downloads":0,"views":3}
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -107,4 +107,19 @@ class DataciteToOAFTest extends AbstractVocabularyTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
def testFilter(): Unit = {
  // Read the fixture and release the underlying stream as soon as its content is in memory.
  val source = Source
    .fromInputStream(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json")
    )
  val record =
    try source.mkString
    finally source.close()

  val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)

  // the transformation is expected to produce nothing for this record
  assertTrue(res.isEmpty)

}
|
||||
|
||||
}
|
||||
|
|
|
@ -59,54 +59,6 @@ object SparkGenerateDoiBoost {
|
|||
val workingDirPath = parser.get("workingPath")
|
||||
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
||||
|
||||
// Typed aggregator that folds (id, Publication) pairs into a single Publication:
// data info, fields and authors are merged and the id of the incoming pair is applied.
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {

  // null marks a buffer that has not received any publication yet
  override def zero: Publication = null

  // Folds one input pair into the buffer. A null pair, or a pair without publication,
  // leaves an already populated buffer untouched (it used to be replaced by an empty
  // Publication, dropping everything accumulated so far).
  override def reduce(b: Publication, a: (String, Publication)): Publication = {
    val hasInput = a != null && a._2 != null
    if (!hasInput) {
      if (b != null) b else new Publication
    } else if (b == null) {
      a._2.setId(a._1)
      a._2
    } else {
      b.mergeOAFDataInfo(a._2)
      b.mergeFrom(a._2)
      b.setId(a._1)
      val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
      b.setAuthor(authors)
      b
    }
  }

  // Combines two partial buffers. When only one side is populated that side is returned
  // as is; an empty Publication is produced only when both sides are null.
  override def merge(b1: Publication, b2: Publication): Publication = {
    if (b1 == null) {
      if (b2 != null) b2 else new Publication
    } else if (b2 == null) {
      // previously returned an empty Publication here, discarding b1
      b1
    } else {
      b1.mergeOAFDataInfo(b2)
      b1.mergeFrom(b2)
      val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
      b1.setAuthor(authors)
      // the id of b2 wins whenever it is set
      if (b2.getId != null && b2.getId.nonEmpty)
        b1.setId(b2.getId)
      b1
    }
  }

  override def finish(reduction: Publication): Publication = reduction

  override def bufferEncoder: Encoder[Publication] = Encoders.kryo[Publication]

  override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
}
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
|
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.crossref
|
|||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
|
@ -576,14 +576,19 @@ case object Crossref2Oaf {
|
|||
|
||||
def extractDate(dt: String, datePart: List[List[Int]]): String = {
|
||||
if (StringUtils.isNotBlank(dt))
|
||||
return dt
|
||||
return GraphCleaningFunctions.cleanDate(dt)
|
||||
if (datePart != null && datePart.size == 1) {
|
||||
val res = datePart.head
|
||||
if (res.size == 3) {
|
||||
val dp = f"${res.head}-${res(1)}%02d-${res(2)}%02d"
|
||||
if (dp.length == 10) {
|
||||
return dp
|
||||
return GraphCleaningFunctions.cleanDate(dp)
|
||||
}
|
||||
} else if (res.size == 2) {
|
||||
val dp = f"${res.head}-${res(1)}%02d-01"
|
||||
return GraphCleaningFunctions.cleanDate(dp)
|
||||
} else if (res.size == 1) {
|
||||
return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
|
||||
}
|
||||
}
|
||||
null
|
||||
|
|
|
@ -0,0 +1,368 @@
|
|||
|
||||
{
|
||||
|
||||
"indexed": {
|
||||
"date-parts": [
|
||||
[
|
||||
2022,
|
||||
4,
|
||||
22
|
||||
]
|
||||
],
|
||||
"date-time": "2022-04-22T15:30:54Z",
|
||||
"timestamp": 1650641454218
|
||||
},
|
||||
"reference-count": 31,
|
||||
"publisher": "Cambridge University Press (CUP)",
|
||||
"issue": "2",
|
||||
"license": [
|
||||
{
|
||||
"start": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8,
|
||||
22
|
||||
]
|
||||
],
|
||||
"date-time": "2017-08-22T00:00:00Z",
|
||||
"timestamp": 1503360000000
|
||||
},
|
||||
"content-version": "unspecified",
|
||||
"delay-in-days": 21,
|
||||
"URL": "https://www.cambridge.org/core/terms"
|
||||
}
|
||||
],
|
||||
"content-domain": {
|
||||
"domain": [
|
||||
|
||||
],
|
||||
"crossmark-restriction": false
|
||||
},
|
||||
"short-container-title": [
|
||||
"Dance Res. J."
|
||||
],
|
||||
"published-print": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8
|
||||
]
|
||||
]
|
||||
},
|
||||
"abstract": "<jats:p>Gaga, a practice developed by Israeli choreographer Ohad Naharin, is one of the most popular training methods on the global dance market. Structured as a metatechnique, or a system for negotiating techniques within one's body, Gaga teaches students to both draw on and reject multiple movement techniques to create their own movement. I consider how the paradigms of choreography, technique, and improvisation are blurred together in the pedagogical model of a metatechnique and how training dancers to shift between choreographer, dancer, and improviser has significant ramifications for understanding their agency. The metatechnique model of Gaga falls in line with neoliberal values of efficiency and a wide range of skills and knowledge; this analysis provides an understanding of recent trends in dance training in relation to contemporary political and socioeconomic structures.</jats:p>",
|
||||
"DOI": "10.1017/s0149767717000183",
|
||||
"type": "journal-article",
|
||||
"created": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8,
|
||||
22
|
||||
]
|
||||
],
|
||||
"date-time": "2017-08-22T13:32:38Z",
|
||||
"timestamp": 1503408758000
|
||||
},
|
||||
"page": "26-43",
|
||||
"source": "Crossref",
|
||||
"is-referenced-by-count": 10,
|
||||
"title": [
|
||||
"Gaga as Metatechnique: Negotiating Choreography, Improvisation, and Technique in a Neoliberal Dance Market"
|
||||
],
|
||||
"prefix": "10.1017",
|
||||
"volume": "49",
|
||||
"author": [
|
||||
{
|
||||
"given": "Meghan",
|
||||
"family": "Quinlan",
|
||||
"sequence": "first",
|
||||
"affiliation": [
|
||||
|
||||
]
|
||||
}
|
||||
],
|
||||
"member": "56",
|
||||
"published-online": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8,
|
||||
22
|
||||
]
|
||||
]
|
||||
},
|
||||
"reference": [
|
||||
{
|
||||
"key": "S0149767717000183_ref22",
|
||||
"first-page": "38",
|
||||
"article-title": "‘I Don't Want to do African … What About My Technique?’: Transforming Dancing Places into Spaces in the Academy",
|
||||
"volume": "4",
|
||||
"author": "Monroe",
|
||||
"year": "2011",
|
||||
"journal-title": "The Journal of Pan African Studies"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref8",
|
||||
"volume-title": "Discipline and Punish: The Birth of the Prison",
|
||||
"author": "Foucault",
|
||||
"year": "1995"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref24",
|
||||
"volume-title": "Sharing the Dance: Contact Improvisation and American Culture",
|
||||
"author": "Novack",
|
||||
"year": "1990"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref26",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1215/9780822387879"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref10",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1080/01472526.2015.1085759"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref4",
|
||||
"doi-asserted-by": "crossref",
|
||||
"first-page": "235",
|
||||
"volume-title": "Meaning in Motion",
|
||||
"author": "Foster",
|
||||
"year": "1997",
|
||||
"DOI": "10.1215/9780822397281-013"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref17",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1093/acprof:oso/9780199360369.001.0001"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref30",
|
||||
"volume-title": "The Precariat: The New Dangerous Class",
|
||||
"author": "Standing",
|
||||
"year": "2011"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref7",
|
||||
"volume-title": "Choreographing Empathy: Kinesthesia in Performance",
|
||||
"author": "Foster",
|
||||
"year": "2011"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref14",
|
||||
"volume-title": "A Brief History of Neoliberalism",
|
||||
"author": "Harvey",
|
||||
"year": "2007"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref27",
|
||||
"unstructured": "Quinlan Meghan . 2016. “Gaga as Politics: A Case Study of Contemporary Dance Training.” PhD diss., University of California, Riverside."
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref11",
|
||||
"first-page": "xiii",
|
||||
"volume-title": "Taken by Surprise: A Dance Improvisation Reader",
|
||||
"author": "Gere",
|
||||
"year": "2003"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref13",
|
||||
"volume-title": "Declaration",
|
||||
"author": "Hardt",
|
||||
"year": "2012"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref29",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1057/9780230236844_9"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref5",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1057/9780230236844_6"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref28",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1017/S0149767700000528"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref12",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.3998/mpub.287881"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref18",
|
||||
"first-page": "135",
|
||||
"volume-title": "Taken by Surprise: A Dance Improvisation Reader",
|
||||
"author": "Marks",
|
||||
"year": "2003"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref3",
|
||||
"doi-asserted-by": "crossref",
|
||||
"volume-title": "Bodies That Matter: On the Discursive Limits of Sex",
|
||||
"author": "Butler",
|
||||
"year": "2011",
|
||||
"DOI": "10.4324/9780203828274"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref32",
|
||||
"volume-title": "Choreographing Difference: The Body and Identity in Contemporary Dance",
|
||||
"author": "Albright",
|
||||
"year": "1997"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref16",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1017/S0149767714000163"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref15",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.2307/1477803"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref21",
|
||||
"first-page": "455",
|
||||
"volume-title": "Incorporations",
|
||||
"author": "Mauss",
|
||||
"year": "1992"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref1",
|
||||
"volume-title": "The Body Eclectic: Evolving Practices in Dance Training",
|
||||
"author": "Bales",
|
||||
"year": "2008"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref9",
|
||||
"unstructured": "Gaga Movement Ltd. 2016. Gaga People. Dancers. Accessed June 28, 2016. http://gagapeople.com/english/."
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref19",
|
||||
"volume-title": "Critical Moves: Dance Studies in Theory and Politics",
|
||||
"author": "Martin",
|
||||
"year": "1998"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref2",
|
||||
"volume-title": "Undoing the Demos: Neoliberalism's Stealth Revolution",
|
||||
"author": "Brown",
|
||||
"year": "2015"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref20",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1162/DRAM_a_00214"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref6",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1215/01610775-2009-016"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref23",
|
||||
"doi-asserted-by": "publisher",
|
||||
"DOI": "10.1093/acprof:oso/9780190201661.001.0001"
|
||||
},
|
||||
{
|
||||
"key": "S0149767717000183_ref25",
|
||||
"volume-title": "Flexible Citizenship: The Cultural Logics of Transnationality",
|
||||
"author": "Ong",
|
||||
"year": "1999"
|
||||
}
|
||||
],
|
||||
"container-title": [
|
||||
"Dance Research Journal"
|
||||
],
|
||||
"original-title": [
|
||||
|
||||
],
|
||||
"language": "en",
|
||||
"link": [
|
||||
{
|
||||
"URL": "https://www.cambridge.org/core/services/aop-cambridge-core/content/view/S0149767717000183",
|
||||
"content-type": "unspecified",
|
||||
"content-version": "vor",
|
||||
"intended-application": "similarity-checking"
|
||||
}
|
||||
],
|
||||
"deposited": {
|
||||
"date-parts": [
|
||||
[
|
||||
2020,
|
||||
10,
|
||||
16
|
||||
]
|
||||
],
|
||||
"date-time": "2020-10-16T01:00:32Z",
|
||||
"timestamp": 1602810032000
|
||||
},
|
||||
"score": 1,
|
||||
"resource": {
|
||||
"primary": {
|
||||
"URL": "https://www.cambridge.org/core/product/identifier/S0149767717000183/type/journal_article"
|
||||
}
|
||||
},
|
||||
"subtitle": [
|
||||
|
||||
],
|
||||
"short-title": [
|
||||
|
||||
],
|
||||
"issued": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8
|
||||
]
|
||||
]
|
||||
},
|
||||
"references-count": 31,
|
||||
"journal-issue": {
|
||||
"issue": "2",
|
||||
"published-print": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
"alternative-id": [
|
||||
"S0149767717000183"
|
||||
],
|
||||
"URL": "http://dx.doi.org/10.1017/s0149767717000183",
|
||||
"relation": {
|
||||
|
||||
},
|
||||
"ISSN": [
|
||||
"0149-7677",
|
||||
"1940-509X"
|
||||
],
|
||||
"issn-type": [
|
||||
{
|
||||
"value": "0149-7677",
|
||||
"type": "print"
|
||||
},
|
||||
{
|
||||
"value": "1940-509X",
|
||||
"type": "electronic"
|
||||
}
|
||||
],
|
||||
"subject": [
|
||||
"Visual Arts and Performing Arts"
|
||||
],
|
||||
"published": {
|
||||
"date-parts": [
|
||||
[
|
||||
2017,
|
||||
8
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
{
|
||||
"indexed":{
|
||||
"date-parts":[
|
||||
[
|
||||
2022,
|
||||
4,
|
||||
14
|
||||
]
|
||||
],
|
||||
"date-time":"2022-04-14T11:27:30Z",
|
||||
"timestamp":1649935650109
|
||||
},
|
||||
"reference-count":22,
|
||||
"publisher":"SAGE Publications",
|
||||
"issue":"2",
|
||||
"license":[
|
||||
{
|
||||
"start":{
|
||||
"date-parts":[
|
||||
[
|
||||
1980,
|
||||
4,
|
||||
1
|
||||
]
|
||||
],
|
||||
"date-time":"1980-04-01T00:00:00Z",
|
||||
"timestamp":323395200000
|
||||
},
|
||||
"content-version":"tdm",
|
||||
"delay-in-days":0,
|
||||
"URL":"http:\/\/journals.sagepub.com\/page\/policies\/text-and-data-mining-license"
|
||||
}
|
||||
],
|
||||
"content-domain":{
|
||||
"domain":[
|
||||
|
||||
],
|
||||
"crossmark-restriction":false
|
||||
},
|
||||
"short-container-title":[
|
||||
"Perception"
|
||||
],
|
||||
"published-print":{
|
||||
"date-parts":[
|
||||
[
|
||||
1980,
|
||||
4
|
||||
]
|
||||
]
|
||||
},
|
||||
"abstract":"<jats:p> To answer the question \u2018What is suppressed during binocular rivalry?\u2019 a series of three experiments was performed. In the first experiment observers viewed binocular rivalry between orthogonally oriented patterns. When the dominant and suppressed patterns were interchanged between the eyes observers continued seeing with the dominant eye, indicating that an eye, not a pattern, is suppressed during rivalry. In a second experiment it was found that a suppressed eye was able to contribute to stereopsis. A third experiment demonstrated that the predominance of an eye could be influenced by prior adaptation of the other eye, indicating that binocular mechanisms participate in the rivalry process. <\/jats:p>",
|
||||
"DOI":"10.1068\/p090223",
|
||||
"type":"journal-article",
|
||||
"created":{
|
||||
"date-parts":[
|
||||
[
|
||||
2007,
|
||||
1,
|
||||
23
|
||||
]
|
||||
],
|
||||
"date-time":"2007-01-23T15:21:36Z",
|
||||
"timestamp":1169565696000
|
||||
},
|
||||
"page":"223-231",
|
||||
"source":"Crossref",
|
||||
"is-referenced-by-count":123,
|
||||
"title":[
|
||||
"What is Suppressed during Binocular Rivalry?"
|
||||
],
|
||||
"prefix":"10.1177",
|
||||
"volume":"9",
|
||||
"author":[
|
||||
{
|
||||
"given":"Randolph",
|
||||
"family":"Blake",
|
||||
"sequence":"first",
|
||||
"affiliation":[
|
||||
{
|
||||
"name":"Cresap Neuroscience Laboratory, Northwestern University, Evanston, Illinois 60201, USA"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"given":"David H",
|
||||
"family":"Westendorf",
|
||||
"sequence":"additional",
|
||||
"affiliation":[
|
||||
{
|
||||
"name":"Department of Psychology, University of Arkansas, Fayetteville, Arkansas 72701, USA"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"given":"Randall",
|
||||
"family":"Overton",
|
||||
"sequence":"additional",
|
||||
"affiliation":[
|
||||
{
|
||||
"name":"Department of Psychology, Illinois State University, Normal, Illinois 61761, USA"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"member":"179",
|
||||
"published-online":{
|
||||
"date-parts":[
|
||||
[
|
||||
2016,
|
||||
6,
|
||||
25
|
||||
]
|
||||
]
|
||||
},
|
||||
"reference":[
|
||||
{
|
||||
"key":"bibr1-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1136\/bjo.37.1.37"
|
||||
},
|
||||
{
|
||||
"key":"bibr2-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1037\/0096-1523.5.2.315"
|
||||
},
|
||||
{
|
||||
"key":"bibr3-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1016\/0042-6989(74)90065-0"
|
||||
},
|
||||
{
|
||||
"key":"bibr4-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1068\/p080143"
|
||||
},
|
||||
{
|
||||
"key":"bibr5-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1016\/0042-6989(70)90036-2"
|
||||
},
|
||||
{
|
||||
"key":"bibr6-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1113\/jphysiol.1969.sp008862"
|
||||
},
|
||||
{
|
||||
"key":"bibr7-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1113\/jphysiol.1972.sp010006"
|
||||
},
|
||||
{
|
||||
"key":"bibr8-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1146\/annurev.ps.23.020172.002213"
|
||||
},
|
||||
{
|
||||
"key":"bibr9-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1126\/science.166.3902.245"
|
||||
},
|
||||
{
|
||||
"key":"bibr10-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1037\/h0075805"
|
||||
},
|
||||
{
|
||||
"key":"bibr11-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1113\/jphysiol.1968.sp008552"
|
||||
},
|
||||
{
|
||||
"key":"bibr12-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1113\/jphysiol.1965.sp007784"
|
||||
},
|
||||
{
|
||||
"key":"bibr13-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1037\/h0032455"
|
||||
},
|
||||
{
|
||||
"key":"bibr14-p090223",
|
||||
"volume-title":"Treatise on Physiological Optics",
|
||||
"volume":"3",
|
||||
"author":"von Helmholtz H",
|
||||
"year":"1866",
|
||||
"edition":"3"
|
||||
},
|
||||
{
|
||||
"key":"bibr15-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1068\/p040125"
|
||||
},
|
||||
{
|
||||
"key":"bibr16-p090223",
|
||||
"volume-title":"On Binocular Rivalry",
|
||||
"author":"Levelt W J M",
|
||||
"year":"1965"
|
||||
},
|
||||
{
|
||||
"key":"bibr17-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1001\/archopht.1935.00840020011001"
|
||||
},
|
||||
{
|
||||
"key":"bibr18-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.3758\/BF03205796"
|
||||
},
|
||||
{
|
||||
"key":"bibr19-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.3758\/BF03210180"
|
||||
},
|
||||
{
|
||||
"key":"bibr20-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1037\/0033-2909.85.2.376"
|
||||
},
|
||||
{
|
||||
"key":"bibr21-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.1016\/0042-6989(79)90169-X"
|
||||
},
|
||||
{
|
||||
"key":"bibr22-p090223",
|
||||
"doi-asserted-by":"publisher",
|
||||
"DOI":"10.3758\/BF03210465"
|
||||
}
|
||||
],
|
||||
"container-title":[
|
||||
"Perception"
|
||||
],
|
||||
"original-title":[
|
||||
|
||||
],
|
||||
"language":"en",
|
||||
"link":[
|
||||
{
|
||||
"URL":"http:\/\/journals.sagepub.com\/doi\/pdf\/10.1068\/p090223",
|
||||
"content-type":"application\/pdf",
|
||||
"content-version":"vor",
|
||||
"intended-application":"text-mining"
|
||||
},
|
||||
{
|
||||
"URL":"http:\/\/journals.sagepub.com\/doi\/pdf\/10.1068\/p090223",
|
||||
"content-type":"unspecified",
|
||||
"content-version":"vor",
|
||||
"intended-application":"similarity-checking"
|
||||
}
|
||||
],
|
||||
"deposited":{
|
||||
"date-parts":[
|
||||
[
|
||||
2021,
|
||||
12,
|
||||
3
|
||||
]
|
||||
],
|
||||
"date-time":"2021-12-03T11:49:48Z",
|
||||
"timestamp":1638532188000
|
||||
},
|
||||
"score":1,
|
||||
"resource":{
|
||||
"primary":{
|
||||
"URL":"http:\/\/journals.sagepub.com\/doi\/10.1068\/p090223"
|
||||
}
|
||||
},
|
||||
"subtitle":[
|
||||
|
||||
],
|
||||
"short-title":[
|
||||
|
||||
],
|
||||
"issued":{
|
||||
"date-parts":[
|
||||
[
|
||||
1980,
|
||||
4
|
||||
]
|
||||
]
|
||||
},
|
||||
"references-count":22,
|
||||
"journal-issue":{
|
||||
"issue":"2",
|
||||
"published-print":{
|
||||
"date-parts":[
|
||||
[
|
||||
1980,
|
||||
4
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
"alternative-id":[
|
||||
"10.1068\/p090223"
|
||||
],
|
||||
"URL":"http:\/\/dx.doi.org\/10.1068\/p090223",
|
||||
"relation":{
|
||||
|
||||
},
|
||||
"ISSN":[
|
||||
"0301-0066",
|
||||
"1468-4233"
|
||||
],
|
||||
"issn-type":[
|
||||
{
|
||||
"value":"0301-0066",
|
||||
"type":"print"
|
||||
},
|
||||
{
|
||||
"value":"1468-4233",
|
||||
"type":"electronic"
|
||||
}
|
||||
],
|
||||
"subject":[
|
||||
"Artificial Intelligence",
|
||||
"Sensory Systems",
|
||||
"Experimental and Cognitive Psychology",
|
||||
"Ophthalmology"
|
||||
],
|
||||
"published":{
|
||||
"date-parts":[
|
||||
[
|
||||
1980,
|
||||
4
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
|
@ -73,6 +73,20 @@ class CrossrefMappingTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
def crossrefIssueDateTest(): Unit = {
|
||||
val json =
|
||||
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty)
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||
|
||||
println(mapper.writeValueAsString(items.head))
|
||||
}
|
||||
|
||||
@Test
|
||||
def testOrcidID(): Unit = {
|
||||
val json = Source
|
||||
|
@ -82,7 +96,7 @@ class CrossrefMappingTest {
|
|||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
assertFalse(json.isEmpty)
|
||||
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
||||
|
@ -461,6 +475,31 @@ class CrossrefMappingTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testConvertFromCrossRef2OafIssue(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_nojournal.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val items = resultList.filter(p => p.isInstanceOf[Publication])
|
||||
|
||||
assert(items.nonEmpty)
|
||||
assert(items.size == 1)
|
||||
val pub: Publication = items.head.asInstanceOf[Publication]
|
||||
|
||||
assertNotNull(pub.getJournal.getIssnPrinted)
|
||||
assertNotNull(pub.getJournal.getIssnOnline)
|
||||
assertNotNull(pub.getJournal.getName)
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
|
||||
|
||||
|
|
|
@ -0,0 +1,251 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag;
|
||||
|
||||
import static eu.dnetlib.dhp.PropagationConstant.readPath;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class SparkEoscTag {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
|
||||
.qualifier(
|
||||
"EOSC",
|
||||
"European Open Science Cloud",
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
|
||||
public static final DataInfo EOSC_DATAINFO = OafMapperUtils
|
||||
.dataInfo(
|
||||
false, "propagation", true, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"propagation:subject", "Inferred by OpenAIRE",
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.9");
|
||||
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkEoscTag.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
execEoscTag(spark, inputPath, workingPath);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
|
||||
|
||||
readPath(spark, inputPath + "/software", Software.class)
|
||||
.map((MapFunction<Software, Software>) s -> {
|
||||
List<StructuredProperty> sbject;
|
||||
if (!Optional.ofNullable(s.getSubject()).isPresent())
|
||||
s.setSubject(new ArrayList<>());
|
||||
sbject = s.getSubject();
|
||||
|
||||
if (containsCriteriaNotebook(s)) {
|
||||
sbject.add(EOSC_NOTEBOOK);
|
||||
if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) {
|
||||
sbject = sbject.stream().map(sb -> {
|
||||
if (sb.getValue().equals("EOSC Jupyter Notebook")) {
|
||||
return null;
|
||||
}
|
||||
return sb;
|
||||
}).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
s.setSubject(sbject);
|
||||
}
|
||||
}
|
||||
if (containsCriteriaGalaxy(s)) {
|
||||
sbject.add(EOSC_GALAXY);
|
||||
}
|
||||
return s;
|
||||
}, Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/software");
|
||||
|
||||
readPath(spark, workingPath + "/software", Software.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/software");
|
||||
|
||||
readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
|
||||
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
|
||||
List<StructuredProperty> sbject;
|
||||
if (!Optional.ofNullable(orp.getSubject()).isPresent())
|
||||
orp.setSubject(new ArrayList<>());
|
||||
sbject = orp.getSubject();
|
||||
if (containsCriteriaGalaxy(orp)) {
|
||||
sbject.add(EOSC_GALAXY);
|
||||
}
|
||||
if (containscriteriaTwitter(orp)) {
|
||||
sbject.add(EOSC_TWITTER);
|
||||
}
|
||||
return orp;
|
||||
}, Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/otherresearchproduct");
|
||||
|
||||
readPath(spark, workingPath + "/otherresearchproduct", OtherResearchProduct.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/otherresearchproduct");
|
||||
|
||||
readPath(spark, inputPath + "/dataset", Dataset.class)
|
||||
.map((MapFunction<Dataset, Dataset>) d -> {
|
||||
List<StructuredProperty> sbject;
|
||||
if (!Optional.ofNullable(d.getSubject()).isPresent())
|
||||
d.setSubject(new ArrayList<>());
|
||||
sbject = d.getSubject();
|
||||
if (containscriteriaTwitter(d)) {
|
||||
sbject.add(EOSC_TWITTER);
|
||||
}
|
||||
return d;
|
||||
}, Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/dataset");
|
||||
|
||||
readPath(spark, workingPath + "/dataset", Dataset.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/dataset");
|
||||
}
|
||||
|
||||
private static boolean containscriteriaTwitter(Result r) {
|
||||
Set<String> words = getWordsSP(r.getTitle());
|
||||
words.addAll(getWordsF(r.getDescription()));
|
||||
|
||||
if (words.contains("twitter") &&
|
||||
(words.contains("data") || words.contains("dataset")))
|
||||
return true;
|
||||
|
||||
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
|
||||
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean containsCriteriaGalaxy(Result r) {
|
||||
Set<String> words = getWordsSP(r.getTitle());
|
||||
words.addAll(getWordsF(r.getDescription()));
|
||||
if (words.contains("galaxy") &&
|
||||
words.contains("workflow"))
|
||||
return true;
|
||||
|
||||
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
|
||||
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow")))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean containsCriteriaNotebook(Software s) {
|
||||
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
|
||||
return true;
|
||||
if (s
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
sbj -> sbj.getValue().toLowerCase().contains("python") &&
|
||||
sbj.getValue().toLowerCase().contains("notebook")))
|
||||
return true;
|
||||
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
|
||||
s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook")))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static Set<String> getSubjects(List<StructuredProperty> s) {
|
||||
Set<String> subjects = new HashSet<>();
|
||||
s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" "))));
|
||||
s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase()));
|
||||
return subjects;
|
||||
}
|
||||
|
||||
private static Set<String> getWordsSP(List<StructuredProperty> elem) {
|
||||
Set<String> words = new HashSet<>();
|
||||
Optional
|
||||
.ofNullable(elem)
|
||||
.ifPresent(
|
||||
e -> e
|
||||
.forEach(
|
||||
t -> words
|
||||
.addAll(
|
||||
Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))));
|
||||
return words;
|
||||
}
|
||||
|
||||
private static Set<String> getWordsF(List<Field<String>> elem) {
|
||||
Set<String> words = new HashSet<>();
|
||||
Optional
|
||||
.ofNullable(elem)
|
||||
.ifPresent(
|
||||
e -> e
|
||||
.forEach(
|
||||
t -> words
|
||||
.addAll(
|
||||
Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))));
|
||||
// elem
|
||||
// .forEach(
|
||||
// t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
|
||||
return words;
|
||||
|
||||
}
|
||||
}
|
|
@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -87,6 +88,8 @@ public class PrepareDatasourceCountryAssociation {
|
|||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
|
||||
Optional.ofNullable(ds.getDatasourcetype()).isPresent() &&
|
||||
Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() &&
|
||||
(allowedtypes.contains(ds.getDatasourcetype().getClassid()) ||
|
||||
whitelist.contains(ds.getId())));
|
||||
|
||||
|
|
|
@ -102,21 +102,28 @@ public class SparkCountryPropagationJob {
|
|||
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
||||
return t -> {
|
||||
Optional.ofNullable(t._2()).ifPresent(r -> {
|
||||
if (Optional.ofNullable(t._1().getCountry()).isPresent())
|
||||
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
|
||||
else
|
||||
t._1().setCountry(merge(null, t._2().getCountrySet()));
|
||||
});
|
||||
return t._1();
|
||||
};
|
||||
}
|
||||
|
||||
private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
|
||||
HashSet<String> countries = c1
|
||||
HashSet<String> countries = new HashSet<>();
|
||||
if (Optional.ofNullable(c1).isPresent()) {
|
||||
countries = c1
|
||||
.stream()
|
||||
.map(Qualifier::getClassid)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
HashSet<String> finalCountries = countries;
|
||||
return c2
|
||||
.stream()
|
||||
.filter(c -> !countries.contains(c.getClassid()))
|
||||
.filter(c -> !finalCountries.contains(c.getClassid()))
|
||||
.map(c -> getCountry(c.getClassid(), c.getClassname()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
[
|
||||
{
|
||||
"paramName":"s",
|
||||
"paramLongName":"sourcePath",
|
||||
"paramDescription": "the path of the sequencial file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the path used to store temporary output files",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ssm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -204,7 +204,31 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait" to="End"/>
|
||||
<join name="wait" to="eosc_tag"/>
|
||||
|
||||
<action name="eosc_tag">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>EOSC_tagging</name>
|
||||
<class>eu.dnetlib.dhp.bulktag.SparkEoscTag</class>
|
||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--num-executors=${sparkExecutorNumber}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
|
|
|
@ -0,0 +1,538 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag;
|
||||
|
||||
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class EOSCTagJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(EOSCTagJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(EOSCTagJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void jupyterUpdatesTest() throws Exception {
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
workingDir.toString() + "/input",
|
||||
"-workingPath", workingDir.toString() + "/working"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Software> tmp = sc
|
||||
.textFile(workingDir.toString() + "/input/software")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
|
||||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4,
|
||||
tmp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
9, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
9, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
List<StructuredProperty> subjects = tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject();
|
||||
Assertions.assertEquals(8, subjects.size());
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("algorithme")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("simulation numérique")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
10, sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0, sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
|
||||
.filter(
|
||||
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
10, sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0, sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
|
||||
.filter(
|
||||
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
// spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void galaxyUpdatesTest() throws Exception {
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
workingDir.toString() + "/input",
|
||||
"-workingPath", workingDir.toString() + "/working"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Software> tmp = sc
|
||||
.textFile(workingDir.toString() + "/input/software")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
|
||||
|
||||
Assertions.assertEquals(11, tmp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
8, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
JavaRDD<OtherResearchProduct> orp = sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
|
||||
|
||||
Assertions.assertEquals(10, orp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
orp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void twitterUpdatesTest() throws Exception {
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
workingDir.toString() + "/input",
|
||||
"-workingPath", workingDir.toString() + "/working"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Software> tmp = sc
|
||||
.textFile(workingDir.toString() + "/input/software")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
|
||||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0,
|
||||
tmp
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
JavaRDD<OtherResearchProduct> orp = sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
|
||||
|
||||
Assertions.assertEquals(10, orp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3,
|
||||
orp
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
JavaRDD<Dataset> dats = sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(11, dats.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3,
|
||||
dats
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,130 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public class CleanContextSparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CleanContextSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
String contextId = parser.get("contextId");
|
||||
log.info("contextId: {}", contextId);
|
||||
|
||||
String verifyParam = parser.get("verifyParam");
|
||||
log.info("verifyParam: {}", verifyParam);
|
||||
|
||||
String graphTableClassName = parser.get("graphTableClassName");
|
||||
log.info("graphTableClassName: {}", graphTableClassName);
|
||||
|
||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
|
||||
String inputPath, Class<T> entityClazz, String workingPath) {
|
||||
Dataset<T> res = spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
||||
Encoders.bean(entityClazz));
|
||||
|
||||
res.map((MapFunction<T, T>) r -> {
|
||||
if (!r
|
||||
.getTitle()
|
||||
.stream()
|
||||
.filter(
|
||||
t -> t
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
|
||||
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) {
|
||||
return r;
|
||||
}
|
||||
r
|
||||
.setContext(
|
||||
r
|
||||
.getContext()
|
||||
.stream()
|
||||
.filter(
|
||||
c -> !c.getId().split("::")[0]
|
||||
.equalsIgnoreCase(contextId))
|
||||
.collect(Collectors.toList()));
|
||||
return r;
|
||||
}, Encoders.bean(entityClazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(workingPath)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
||||
Encoders.bean(entityClazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath);
|
||||
}
|
||||
}
|
|
@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -136,7 +137,7 @@ public class MergeGraphTableSparkJob {
|
|||
/**
|
||||
* Datasources involved in the merge operation don't obey the infra precedence policy, but rely on a custom
|
||||
* behaviour that, given two datasources from beta and prod returns the one from prod with the highest
|
||||
* compatibility among the two.
|
||||
* compatibility among the two. Furthermore, the procedure merges the collectedfrom, originalId, and pid lists.
|
||||
*
|
||||
* @param p datasource from PROD
|
||||
* @param b datasource from BETA
|
||||
|
@ -160,9 +161,37 @@ public class MergeGraphTableSparkJob {
|
|||
|
||||
List<Qualifier> list = Arrays.asList(dp.getOpenairecompatibility(), db.getOpenairecompatibility());
|
||||
dp.setOpenairecompatibility(Collections.min(list, new DatasourceCompatibilityComparator()));
|
||||
dp
|
||||
.setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(dp.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(db.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
dp.setOriginalId(mergeLists(dp.getOriginalId(), db.getOriginalId()));
|
||||
dp.setPid(mergeLists(dp.getPid(), db.getPid()));
|
||||
|
||||
return (P) dp;
|
||||
}
|
||||
|
||||
private static final <T> List<T> mergeLists(final List<T>... lists) {
|
||||
return Arrays
|
||||
.stream(lists)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(List::stream)
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
|
||||
if (b.isPresent() & !p.isPresent()) {
|
||||
return (P) b.get();
|
||||
|
|
|
@ -18,14 +18,7 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
|
|||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.*;
|
||||
|
@ -35,6 +28,7 @@ import com.google.common.collect.Sets;
|
|||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
|
@ -199,8 +193,13 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
final List<Oaf> oafs = Lists.newArrayList(entity);
|
||||
|
||||
if (!oafs.isEmpty()) {
|
||||
oafs.addAll(addProjectRels(doc, entity));
|
||||
oafs.addAll(addOtherResultRels(doc, entity));
|
||||
Set<Oaf> rels = Sets.newHashSet();
|
||||
|
||||
rels.addAll(addProjectRels(doc, entity));
|
||||
rels.addAll(addOtherResultRels(doc, entity));
|
||||
rels.addAll(addRelations(doc, entity));
|
||||
|
||||
oafs.addAll(rels);
|
||||
}
|
||||
|
||||
return oafs;
|
||||
|
@ -278,6 +277,46 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return res;
|
||||
}
|
||||
|
||||
private List<Oaf> addRelations(Document doc, OafEntity entity) {
|
||||
|
||||
final List<Oaf> rels = Lists.newArrayList();
|
||||
|
||||
for (Object o : doc.selectNodes("//oaf:relation")) {
|
||||
Element element = (Element) o;
|
||||
|
||||
final String target = StringUtils.trim(element.getText());
|
||||
final String relType = element.attributeValue("relType");
|
||||
final String subRelType = element.attributeValue("subRelType");
|
||||
final String relClass = element.attributeValue("relClass");
|
||||
|
||||
if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && StringUtils.isNotBlank(subRelType)
|
||||
&& StringUtils.isNotBlank(relClass)) {
|
||||
|
||||
final String relClassInverse = ModelSupport
|
||||
.findInverse(ModelSupport.rel(relType, subRelType, relClass))
|
||||
.getInverseRelClass();
|
||||
final String validationdDate = ((Node) o).valueOf("@validationDate");
|
||||
|
||||
if (StringUtils.isNotBlank(target)) {
|
||||
final String targetType = element.attributeValue("targetType");
|
||||
if (StringUtils.isNotBlank(targetType)) {
|
||||
final String targetId = createOpenaireId(targetType, target, true);
|
||||
rels
|
||||
.add(
|
||||
getRelation(
|
||||
entity.getId(), targetId, relType, subRelType, relClass, entity, validationdDate));
|
||||
rels
|
||||
.add(
|
||||
getRelation(
|
||||
targetId, entity.getId(), relType, subRelType, relClassInverse, entity,
|
||||
validationdDate));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return rels;
|
||||
}
|
||||
|
||||
protected Relation getRelation(final String source,
|
||||
final String target,
|
||||
final String relType,
|
||||
|
@ -340,7 +379,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
r.setPublisher(preparePublisher(doc, info));
|
||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||
r.setSource(prepareSources(doc, info));
|
||||
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info));
|
||||
r.setFormat(prepareFormats(doc, info));
|
||||
r.setContributor(prepareContributors(doc, info));
|
||||
r.setResourcetype(prepareResourceType(doc, info));
|
||||
|
|
|
@ -27,15 +27,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.asString;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -143,8 +135,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
smdbe.execute("queryClaims.sql", smdbe::processClaims);
|
||||
break;
|
||||
case openaire:
|
||||
log.info("Processing datasources...");
|
||||
smdbe.execute("queryDatasources.sql", smdbe::processDatasource, verifyNamespacePrefix);
|
||||
log.info("Processing services...");
|
||||
smdbe.execute("queryServices.sql", smdbe::processService, verifyNamespacePrefix);
|
||||
|
||||
log.info("Processing projects...");
|
||||
if (dbSchema.equalsIgnoreCase("beta")) {
|
||||
|
@ -156,10 +148,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
log.info("Processing Organizations...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix);
|
||||
|
||||
log.info("Processing relationsNoRemoval ds <-> orgs ...");
|
||||
log.info("Processing relations services <-> orgs ...");
|
||||
smdbe
|
||||
.execute(
|
||||
"queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization,
|
||||
"queryServiceOrganization.sql", smdbe::processServiceOrganization,
|
||||
verifyNamespacePrefix);
|
||||
|
||||
log.info("Processing projects <-> orgs ...");
|
||||
|
@ -235,32 +227,30 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
dbClient.processResults(sql, consumer);
|
||||
}
|
||||
|
||||
public List<Oaf> processDatasource(final ResultSet rs) {
|
||||
public List<Oaf> processService(final ResultSet rs) {
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Datasource ds = new Datasource();
|
||||
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
||||
ds.setId(createOpenaireId(10, rs.getString("id"), true));
|
||||
ds
|
||||
.setOriginalId(
|
||||
Arrays
|
||||
.asList((String[]) rs.getArray("identities").getArray())
|
||||
.asList((String[]) rs.getArray("originalid").getArray())
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList()));
|
||||
ds
|
||||
.setCollectedfrom(
|
||||
listKeyValues(
|
||||
createOpenaireId(10, rs.getString("collectedfromid"), true),
|
||||
rs.getString("collectedfromname")));
|
||||
ds.setPid(new ArrayList<>());
|
||||
ds.setCollectedfrom(prepareCollectedfrom(rs.getArray("collectedfrom")));
|
||||
ds.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
|
||||
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
ds.setDateoftransformation(null); // Value not returned by the SQL query
|
||||
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
ds.setOaiprovenance(null); // Values not present in the DB
|
||||
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
|
||||
ds.setDatasourcetypeui(prepareQualifierSplitting(rs.getString("datasourcetypeui")));
|
||||
ds.setEosctype(prepareQualifierSplitting(rs.getString("eosctype")));
|
||||
ds.setEoscdatasourcetype(prepareQualifierSplitting(rs.getString("eoscdatasourcetype")));
|
||||
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
|
||||
ds.setOfficialname(field(rs.getString("officialname"), info));
|
||||
ds.setEnglishname(field(rs.getString("englishname"), info));
|
||||
|
@ -277,20 +267,19 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
|
||||
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
|
||||
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
|
||||
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
|
||||
ds.setLanguages(listValues(rs.getArray("languages")));
|
||||
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
|
||||
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
|
||||
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
|
||||
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
|
||||
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
|
||||
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
|
||||
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
|
||||
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
|
||||
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
|
||||
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
|
||||
ds.setVersioning(field(rs.getBoolean("versioning"), info));
|
||||
ds.setVersioncontrol(rs.getBoolean("versioncontrol"));
|
||||
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
|
||||
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
|
||||
|
||||
ds.setPidsystems(field(rs.getString("pidsystems"), info));
|
||||
ds.setCertificates(field(rs.getString("certificates"), info));
|
||||
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
||||
|
@ -299,13 +288,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
journal(
|
||||
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
|
||||
rs.getString("issnLinking"), info)); // Journal
|
||||
ds.setDataInfo(info);
|
||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
ds.setResearchentitytypes(listValues(rs.getArray("researchentitytypes")));
|
||||
ds.setJurisdiction(prepareQualifierSplitting(rs.getString("jurisdiction")));
|
||||
ds.setThematic(rs.getBoolean("thematic"));
|
||||
ds.setKnowledgegraph(rs.getBoolean("knowledgegraph"));
|
||||
ds.setContentpolicies(prepareListOfQualifiers(rs.getArray("contentpolicies")));
|
||||
ds.setSubmissionpolicyurl(rs.getString("submissionpolicyurl"));
|
||||
ds.setPreservationpolicyurl(rs.getString("preservationpolicyurl"));
|
||||
ds.setResearchproductaccesspolicies(listValues(rs.getArray("researchproductaccesspolicies")));
|
||||
ds
|
||||
.setResearchproductmetadataaccesspolicies(
|
||||
listValues(rs.getArray("researchproductmetadataaccesspolicies")));
|
||||
|
||||
ds.setConsenttermsofuse(rs.getBoolean("consenttermsofuse"));
|
||||
ds.setFulltextdownload(rs.getBoolean("fulltextdownload"));
|
||||
ds
|
||||
|
@ -313,8 +307,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
Optional
|
||||
.ofNullable(
|
||||
rs.getDate("consenttermsofusedate"))
|
||||
.map(c -> c.toString())
|
||||
.map(java.sql.Date::toString)
|
||||
.orElse(null));
|
||||
ds
|
||||
.setLastconsenttermsofusedate(
|
||||
Optional
|
||||
.ofNullable(
|
||||
rs.getDate("lastconsenttermsofusedate"))
|
||||
.map(java.sql.Date::toString)
|
||||
.orElse(null));
|
||||
|
||||
ds.setDataInfo(info);
|
||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(ds);
|
||||
} catch (final Exception e) {
|
||||
|
@ -425,11 +429,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
|
||||
public List<Oaf> processServiceOrganization(final ResultSet rs) {
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("organization"), true);
|
||||
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
|
||||
final String dsId = createOpenaireId(10, rs.getString("service"), true);
|
||||
final List<KeyValue> collectedFrom = listKeyValues(
|
||||
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
|
||||
|
||||
|
@ -603,6 +607,32 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
String.format("%.3f", trust));
|
||||
}
|
||||
|
||||
private List<KeyValue> prepareCollectedfrom(Array values) throws SQLException {
|
||||
if (Objects.isNull(values)) {
|
||||
return null;
|
||||
}
|
||||
return Arrays
|
||||
.stream((String[]) values.getArray())
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.map(s -> keyValueSplitting(s, "@@@"))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static KeyValue keyValueSplitting(final String s, String separator) {
|
||||
if (StringUtils.isBlank(s)) {
|
||||
return null;
|
||||
}
|
||||
final String[] arr = s.split(separator);
|
||||
if (arr.length != 2) {
|
||||
return null;
|
||||
}
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey(createOpenaireId(10, arr[0], true));
|
||||
kv.setValue(arr[1]);
|
||||
return kv;
|
||||
}
|
||||
|
||||
private Qualifier prepareQualifierSplitting(final String s) {
|
||||
if (StringUtils.isBlank(s)) {
|
||||
return null;
|
||||
|
|
|
@ -65,11 +65,19 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
|
|||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String mdstoreManagerUrl = parser.get("mdstoreManagerUrl");
|
||||
log.info("mdstoreManagerUrl: {}", mdstoreManagerUrl);
|
||||
|
||||
final String mdFormat = parser.get("mdFormat");
|
||||
log.info("mdFormat: {}", mdFormat);
|
||||
|
||||
final String mdLayout = parser.get("mdLayout");
|
||||
log.info("mdLayout: {}", mdLayout);
|
||||
|
||||
final String mdInterpretation = parser.get("mdInterpretation");
|
||||
log.info("mdInterpretation: {}", mdInterpretation);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
log.info("hdfsPath: {}", hdfsPath);
|
||||
|
||||
final Set<String> paths = mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation);
|
||||
|
||||
|
@ -95,6 +103,8 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
|
|||
.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
|
||||
.toArray(size -> new String[size]);
|
||||
|
||||
log.info("Processing existing paths {}", Arrays.asList(validPaths));
|
||||
|
||||
if (validPaths.length > 0) {
|
||||
spark
|
||||
.read()
|
||||
|
|
|
@ -13,6 +13,23 @@
|
|||
<name>isLookupUrl</name>
|
||||
<description>the address of the lookUp service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shouldCleanContext</name>
|
||||
<description>true if the context has to be cleaned</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>contextId</name>
|
||||
<value>sobigdata</value>
|
||||
<description>It is the context id that should be removed from the result if the condition is matched.
|
||||
Now it is just sobigdata. In a future implementation I plan to have the contextId as value in a json
|
||||
where to specify also the constraints that should be verified to remove the context from the result</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>verifyParam</name>
|
||||
<value>gcube </value>
|
||||
<description>It is the constraint to be verified. For now it is hardcoded as gcube and it is searched for in
|
||||
the title. If the title starts with gcube then the context sobigdata will be removed from the result, if present</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
|
@ -275,7 +292,131 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean" to="End"/>
|
||||
<join name="wait_clean" to="clean_context"/>
|
||||
|
||||
<decision name="clean_context">
|
||||
<switch>
|
||||
<case to="fork_clean_context">${wf:conf('shouldCleanContext') eq true}</case>
|
||||
<default to="End"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
|
||||
<fork name="fork_clean_context">
|
||||
<path start="clean_publication_context"/>
|
||||
<path start="clean_dataset_context"/>
|
||||
<path start="clean_otherresearchproduct_context"/>
|
||||
<path start="clean_software_context"/>
|
||||
</fork>
|
||||
<action name="clean_publication_context">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean publications context</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_dataset_context">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean datasets Context</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_otherresearchproduct_context">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean otherresearchproducts context</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_software_context">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean softwares context</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean_context" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,37 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "in",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path to the graph data dump to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ci",
|
||||
"paramLongName": "contextId",
|
||||
"paramDescription": "the id of the context to be removed",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "class",
|
||||
"paramLongName": "graphTableClassName",
|
||||
"paramDescription": "class name modelling the graph table",
|
||||
"paramRequired": true
|
||||
},{
|
||||
"paramName": "vf",
|
||||
"paramLongName": "verifyParam",
|
||||
"paramDescription": "the parameter to be verified to remove the context",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -30,6 +30,11 @@
|
|||
<value></value>
|
||||
<description>a blacklist of nsprefixes (comma separated)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>reuseContent</name>
|
||||
<value>false</value>
|
||||
<description>reuse content in the aggregator database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -85,12 +90,20 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ImportDB"/>
|
||||
<start to="reuse_db"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="reuse_db">
|
||||
<switch>
|
||||
<case to="ImportDB">${wf:conf('reuseContent') eq false}</case>
|
||||
<case to="GenerateEntities">${wf:conf('reuseContent') eq true}</case>
|
||||
<default to="ImportDB"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<prepare>
|
||||
|
@ -102,6 +115,7 @@
|
|||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--action</arg><arg>openaire</arg>
|
||||
<arg>--dbschema</arg><arg>${dbSchema}</arg>
|
||||
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
|
||||
</java>
|
||||
|
@ -124,6 +138,55 @@
|
|||
<arg>--action</arg><arg>claims</arg>
|
||||
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
|
||||
</java>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/db_claims</arg>
|
||||
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--shouldHashId</arg><arg>true</arg>
|
||||
</spark>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/entities</arg>
|
||||
<arg>--graphRawPath</arg><arg>${workingDir}/graph_aggregator</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -27,7 +27,7 @@ SELECT
|
|||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid
|
||||
FROM dsm_organizations o
|
||||
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
|
||||
LEFT OUTER JOIN dsm_services d ON (d.id = o.collectedfrom)
|
||||
LEFT OUTER JOIN dsm_organizationpids p ON (p.organization = o.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = p.pid)
|
||||
GROUP BY
|
||||
|
|
|
@ -16,4 +16,4 @@ SELECT
|
|||
|
||||
FROM project_organization po
|
||||
LEFT OUTER JOIN projects p ON (p.id = po.project)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom);
|
||||
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom);
|
|
@ -42,7 +42,7 @@ SELECT
|
|||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
|
|
@ -40,7 +40,7 @@ SELECT
|
|||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
SELECT
|
||||
dor.datasource AS datasource,
|
||||
dor.service AS service,
|
||||
dor.organization AS organization,
|
||||
NULL AS startdate,
|
||||
NULL AS enddate,
|
||||
|
@ -11,6 +11,6 @@ SELECT
|
|||
dc.officialname AS collectedfromname,
|
||||
'providedBy@@@dnet:datasources_organizations_typologies' AS semantics,
|
||||
d.provenanceaction || '@@@dnet:provenanceActions' AS provenanceaction
|
||||
FROM dsm_datasource_organization dor
|
||||
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom);
|
||||
FROM dsm_service_organization dor
|
||||
LEFT OUTER JOIN dsm_services d ON (dor.service = d.id)
|
||||
LEFT OUTER JOIN dsm_services dc ON (dc.id = d.collectedfrom);
|
|
@ -1,6 +1,7 @@
|
|||
SELECT
|
||||
d.id AS datasourceid,
|
||||
d.id || array_agg(distinct di.pid) AS identities,
|
||||
d.id AS id,
|
||||
array_remove(d.id || array_agg(distinct CASE WHEN dp.pid like 'piwik%' THEN di.pid ELSE NULL END) || array_agg(distinct dds.duplicate), NULL) AS originalid,
|
||||
array_remove(array_agg(distinct CASE WHEN di.pid NOT LIKE 'piwik%' THEN di.pid||'###'||di.issuertype||'@@@'||'dnet:pid_types' ELSE NULL END), NULL) as pid,
|
||||
d.officialname AS officialname,
|
||||
d.englishname AS englishname,
|
||||
d.contactemail AS contactemail,
|
||||
|
@ -40,14 +41,13 @@ SELECT
|
|||
END AS openairecompatibility,
|
||||
d.websiteurl AS websiteurl,
|
||||
d.logourl AS logourl,
|
||||
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
|
||||
array_remove(array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END), NULL) AS accessinfopackage,
|
||||
d.latitude AS latitude,
|
||||
d.longitude AS longitude,
|
||||
d.namespaceprefix AS namespaceprefix,
|
||||
NULL AS odnumberofitems,
|
||||
NULL AS odnumberofitemsdate,
|
||||
|
||||
(SELECT array_agg(s|| '###keywords@@@dnet:subject_classification_typologies')
|
||||
(SELECT array_agg(s|| '###keyword@@@dnet:subject_classification_typologies')
|
||||
FROM UNNEST(
|
||||
ARRAY(
|
||||
SELECT trim(s)
|
||||
|
@ -55,10 +55,15 @@ SELECT
|
|||
|
||||
d.description AS description,
|
||||
NULL AS odpolicies,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
|
||||
array_remove(ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(regexp_replace(d.languages, '{|}|"', '', 'g'), ',')) AS s), '{}') AS odlanguages,
|
||||
array_remove(ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(regexp_replace(d.languages, '{|}|"', '', 'g'), ',')) AS s), '{}') AS languages,
|
||||
-- Term provided only by OpenDOAR:
|
||||
-- probably updating the TR it could be replaced by research_entity_types[]
|
||||
-- But a study on the vocabulary terms is needed
|
||||
-- REMOVED: ARRAY(SELECT trim(s) FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
|
||||
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.9 AS trust,
|
||||
|
@ -69,39 +74,59 @@ SELECT
|
|||
d.releasestartdate AS releasestartdate,
|
||||
d.releaseenddate AS releaseenddate,
|
||||
d.missionstatementurl AS missionstatementurl,
|
||||
d.dataprovider AS dataprovider,
|
||||
d.serviceprovider AS serviceprovider,
|
||||
-- the following 2 fields (provided by re3data) have been replaced by research_entity_types[]
|
||||
-- VALUE 'Research Data' : d.dataprovider AS dataprovider,
|
||||
-- VALUE 'Services' : d.serviceprovider AS serviceprovider,
|
||||
d.databaseaccesstype AS databaseaccesstype,
|
||||
d.datauploadtype AS datauploadtype,
|
||||
d.databaseaccessrestriction AS databaseaccessrestriction,
|
||||
d.datauploadrestriction AS datauploadrestriction,
|
||||
d.versioning AS versioning,
|
||||
-- REPLACED BY version_control : d.versioning AS versioning,
|
||||
d.version_control AS versioning,
|
||||
d.version_control AS versioncontrol,
|
||||
d.citationguidelineurl AS citationguidelineurl,
|
||||
d.qualitymanagementkind AS qualitymanagementkind,
|
||||
d.pidsystems AS pidsystems,
|
||||
array_to_string(array_agg(distinct dps.scheme), ' ') AS pidsystems,
|
||||
d.certificates AS certificates,
|
||||
ARRAY[]::text[] AS policies,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
|
||||
d.typology||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
|
||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_remove(
|
||||
array(
|
||||
select distinct cf
|
||||
from unnest(
|
||||
dc.id||'@@@'||dc.officialname || array_agg(distinct dds_cf.id||'@@@'||dds_cf.officialname)
|
||||
) as cf),
|
||||
NULL) AS collectedfrom,
|
||||
d._typology_to_remove_||'@@@dnet:datasource_typologies' AS datasourcetype,
|
||||
d._typology_to_remove_||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
|
||||
d.eosc_type||'@@@dnet:eosc_types' AS eosctype,
|
||||
d.eosc_datasource_type||'@@@dnet:eosc_datasource_types' AS eoscdatasourcetype,
|
||||
d.issn AS issnPrinted,
|
||||
d.eissn AS issnOnline,
|
||||
d.lissn AS issnLinking,
|
||||
d.research_entity_types AS researchentitytypes,
|
||||
d.consenttermsofuse AS consenttermsofuse,
|
||||
d.fulltextdownload AS fulltextdownload,
|
||||
d.consenttermsofusedate AS consenttermsofusedate,
|
||||
de.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
|
||||
de.thematic AS thematic,
|
||||
de.knowledge_graph AS knowledgegraph,
|
||||
array(select unnest(de.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies
|
||||
d.lastconsenttermsofusedate AS lastconsenttermsofusedate,
|
||||
d.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
|
||||
d.thematic AS thematic,
|
||||
array(select unnest(d.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies,
|
||||
nullif(trim(d.submission_policy_url), '') AS submissionpolicyurl,
|
||||
nullif(trim(d.preservation_policy_url), '') AS preservationpolicyurl,
|
||||
array_remove(d.research_product_access_policies, '') AS researchproductaccesspolicies,
|
||||
array_remove(d.research_product_metadata_access_policies, '') AS researchproductmetadataaccesspolicies
|
||||
|
||||
FROM dsm_datasources d
|
||||
LEFT OUTER JOIN dsm_datasources_eosc de on (d.id = de.id)
|
||||
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
|
||||
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
|
||||
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
|
||||
FROM dsm_services d
|
||||
LEFT OUTER JOIN dsm_services dc on (d.collectedfrom = dc.id)
|
||||
LEFT OUTER JOIN dsm_api a ON (d.id = a.service)
|
||||
LEFT OUTER JOIN dsm_servicepids dp ON (d.id = dp.service)
|
||||
LEFT OUTER JOIN dsm_identities di ON (dp.pid = di.pid)
|
||||
LEFT OUTER JOIN dsm_dedup_services dds ON (d.id = dds.id)
|
||||
LEFT OUTER JOIN dsm_services dds_dup ON (dds.duplicate = dds_dup.id)
|
||||
LEFT OUTER JOIN dsm_services dds_cf ON (dds_dup.collectedfrom = dds_cf.id)
|
||||
LEFT OUTER JOIN dsm_pid_systems dps ON (d.id = dps.service)
|
||||
|
||||
WHERE
|
||||
d.dedup_main_service = true
|
||||
|
||||
GROUP BY
|
||||
d.id,
|
||||
|
@ -119,23 +144,27 @@ GROUP BY
|
|||
d.releasestartdate,
|
||||
d.releaseenddate,
|
||||
d.missionstatementurl,
|
||||
d.dataprovider,
|
||||
d.serviceprovider,
|
||||
-- TODO REMOVED ???: d.dataprovider,
|
||||
-- TODO REMOVED ???: d.serviceprovider,
|
||||
d.databaseaccesstype,
|
||||
d.datauploadtype,
|
||||
d.databaseaccessrestriction,
|
||||
d.datauploadrestriction,
|
||||
d.versioning,
|
||||
-- REPLACED BY version_control : d.versioning,
|
||||
d.version_control,
|
||||
d.citationguidelineurl,
|
||||
d.qualitymanagementkind,
|
||||
d.pidsystems,
|
||||
-- REMOVED: d.qualitymanagementkind,
|
||||
d.certificates,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
d.issn,
|
||||
d.eissn,
|
||||
d.lissn,
|
||||
de.jurisdiction,
|
||||
de.thematic,
|
||||
de.knowledge_graph,
|
||||
de.content_policies
|
||||
d.jurisdiction,
|
||||
d.thematic,
|
||||
-- REMOVED ???: de.knowledge_graph,
|
||||
d.content_policies,
|
||||
d.submission_policy_url,
|
||||
d.preservation_policy_url,
|
||||
d.research_product_access_policies,
|
||||
d.research_product_metadata_access_policies
|
|
@ -7,6 +7,7 @@ import org.apache.commons.io.IOUtils
|
|||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkConvertRDDtoDataset {
|
||||
|
||||
|
@ -94,8 +95,8 @@ object SparkConvertRDDtoDataset {
|
|||
log.info("Converting Relation")
|
||||
|
||||
val relationSemanticFilter = List(
|
||||
"cites",
|
||||
"iscitedby",
|
||||
// "cites",
|
||||
// "iscitedby",
|
||||
"merges",
|
||||
"ismergedin",
|
||||
"HasAmongTopNSimilarDocuments",
|
||||
|
@ -107,6 +108,12 @@ object SparkConvertRDDtoDataset {
|
|||
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
||||
// exclude relations collected from OpenCitations (relations without any collectedfrom are dropped as well)
|
||||
.filter(r =>
|
||||
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
|
||||
"opencitations".equalsIgnoreCase(k.getValue)
|
||||
)
|
||||
)
|
||||
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
|
||||
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||
|
||||
|
|
|
@ -1,136 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.graph.pangaea
|
||||
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.util.regex.Pattern
|
||||
import scala.language.postfixOps
|
||||
import scala.xml.{Elem, Node, XML}
|
||||
|
||||
case class PangaeaDataModel(
|
||||
identifier: String,
|
||||
title: List[String],
|
||||
objectType: List[String],
|
||||
creator: List[String],
|
||||
publisher: List[String],
|
||||
dataCenter: List[String],
|
||||
subject: List[String],
|
||||
language: String,
|
||||
rights: String,
|
||||
parent: String,
|
||||
relation: List[String],
|
||||
linkage: List[(String, String)]
|
||||
) {}
|
||||
|
||||
object PangaeaUtils {
|
||||
|
||||
def toDataset(input: String): PangaeaDataModel = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
val xml = (json \ "xml").extract[String]
|
||||
parseXml(xml)
|
||||
}
|
||||
|
||||
def findDOIInRelation(input: List[String]): List[String] = {
|
||||
val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
|
||||
input
|
||||
.map(i => {
|
||||
val matcher = pattern.matcher(i)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
})
|
||||
.filter(i => i != null)
|
||||
}
|
||||
|
||||
def attributeOpt(attribute: String, node: Node): Option[String] =
|
||||
node.attribute(attribute) flatMap (_.headOption) map (_.text)
|
||||
|
||||
def extractLinkage(node: Elem): List[(String, String)] = {
|
||||
(node \ "linkage")
|
||||
.map(n => (attributeOpt("type", n), n.text))
|
||||
.filter(t => t._1.isDefined)
|
||||
.map(t => (t._1.get, t._2))(collection.breakOut)
|
||||
}
|
||||
|
||||
def parseXml(input: String): PangaeaDataModel = {
|
||||
val xml = XML.loadString(input)
|
||||
|
||||
val identifier = (xml \ "identifier").text
|
||||
val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
|
||||
val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
|
||||
val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
|
||||
val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
|
||||
val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
|
||||
val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
|
||||
val language = (xml \ "language").text
|
||||
val rights = (xml \ "rights").text
|
||||
val parentIdentifier = (xml \ "parentIdentifier").text
|
||||
val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
|
||||
val relationFiltered = findDOIInRelation(relation)
|
||||
val linkage: List[(String, String)] = extractLinkage(xml)
|
||||
|
||||
PangaeaDataModel(
|
||||
identifier,
|
||||
title,
|
||||
pType,
|
||||
creators,
|
||||
publisher,
|
||||
dataCenter,
|
||||
subject,
|
||||
language,
|
||||
rights,
|
||||
parentIdentifier,
|
||||
relationFiltered,
|
||||
linkage
|
||||
)
|
||||
}
|
||||
|
||||
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
|
||||
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
|
||||
|
||||
override def zero: PangaeaDataModel = null
|
||||
|
||||
override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel = {
|
||||
if (b == null)
|
||||
a._2
|
||||
else {
|
||||
if (a == null)
|
||||
b
|
||||
else {
|
||||
if (b.title != null && b.title.nonEmpty)
|
||||
b
|
||||
else
|
||||
a._2
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel = {
|
||||
if (b1 == null)
|
||||
b2
|
||||
else {
|
||||
if (b2 == null)
|
||||
b1
|
||||
else {
|
||||
if (b1.title != null && b1.title.nonEmpty)
|
||||
b1
|
||||
else
|
||||
b2
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||
|
||||
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||
}
|
||||
|
||||
}
|
|
@ -1,58 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.graph.pangaea
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkGeneratePanagaeaDataset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
|
||||
logger.info("Converting sequential file into Dataset")
|
||||
val sc: SparkContext = spark.sparkContext
|
||||
|
||||
val workingPath: String = parser.get("workingPath")
|
||||
|
||||
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||
|
||||
val inputRDD: RDD[PangaeaDataModel] =
|
||||
sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
||||
|
||||
spark
|
||||
.createDataset(inputRDD)
|
||||
.as[PangaeaDataModel]
|
||||
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
|
||||
.map(s => s._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/dataset")
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,300 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleanContextTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(DumpJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(DumpJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResultClean() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json")
|
||||
.getPath();
|
||||
final String prefix = "gcube ";
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(sourcePath)
|
||||
.map(
|
||||
(MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class),
|
||||
Encoders.bean(Publication.class))
|
||||
.write()
|
||||
.json(workingDir.toString() + "/publication");
|
||||
|
||||
CleanContextSparkJob.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", workingDir.toString() + "/publication",
|
||||
"-graphTableClassName", Publication.class.getCanonicalName(),
|
||||
"-workingPath", workingDir.toString() + "/working",
|
||||
"-contextId", "sobigdata",
|
||||
"-verifyParam", "gCube "
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.textFile(workingDir.toString() + "/publication")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
|
||||
|
||||
Assertions.assertEquals(7, tmp.count());
|
||||
|
||||
// original result with sobigdata context and gcube as starting string in the main title for the publication
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
|
||||
// original result with sobigdata context without gcube as starting string in the main title for the publication
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"sobigdata::projects::2",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
|
||||
// original result with sobigdata context with gcube as starting string in the subtitle
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"sobigdata::projects::2",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
List<StructuredProperty> titles = tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getTitle();
|
||||
Assertions.assertEquals(1, titles.size());
|
||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||
Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
|
||||
|
||||
// original result with sobigdata context with gcube not as starting string in the main title
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"sobigdata::projects::1",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
titles = tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getTitle();
|
||||
Assertions.assertEquals(1, titles.size());
|
||||
Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
|
||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||
|
||||
// original result with sobigdata in context and also other contexts with gcube as starting string for the main
|
||||
// title
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"dh-ch",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
titles = tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getTitle();
|
||||
Assertions.assertEquals(1, titles.size());
|
||||
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
|
||||
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
|
||||
|
||||
// original result with multiple main titles, one of which has gcube as starting string, and with 2 contexts
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"dh-ch",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
titles = tmp
|
||||
.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getTitle();
|
||||
Assertions.assertEquals(2, titles.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
titles
|
||||
.stream()
|
||||
.anyMatch(
|
||||
t -> t.getQualifier().getClassid().equals("main title")
|
||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||
|
||||
// original result without sobigdata in context with gcube as starting string for the main title
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"dh-ch",
|
||||
tmp
|
||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getContext()
|
||||
.get(0)
|
||||
.getId());
|
||||
titles = tmp
|
||||
.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getTitle();
|
||||
Assertions.assertEquals(2, titles.size());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
titles
|
||||
.stream()
|
||||
.anyMatch(
|
||||
t -> t.getQualifier().getClassid().equals("main title")
|
||||
&& t.getValue().toLowerCase().startsWith(prefix)));
|
||||
|
||||
}
|
||||
}
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.graph.merge;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
@ -25,7 +26,23 @@ class MergeGraphTableSparkJobTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testMergeDatasources() throws IOException {
|
||||
void testMerge() throws IOException {
|
||||
Datasource d = MergeGraphTableSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_cris.json"),
|
||||
d("datasource_openaire2.0.json"));
|
||||
|
||||
assertEquals("10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", d.getId());
|
||||
assertNotNull(d.getOriginalId());
|
||||
assertEquals(2, d.getOriginalId().size());
|
||||
assertNotNull(d.getCollectedfrom());
|
||||
assertEquals(2, d.getCollectedfrom().size());
|
||||
assertNotNull(d.getPid());
|
||||
assertEquals(1, d.getPid().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMergeCompatibility() throws IOException {
|
||||
assertEquals(
|
||||
"openaire-cris_1.1",
|
||||
MergeGraphTableSparkJob
|
||||
|
|
|
@ -57,14 +57,10 @@ class MappersTest {
|
|||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
assertTrue(list.get(1) instanceof Relation);
|
||||
assertTrue(list.get(2) instanceof Relation);
|
||||
assertEquals(1, list.stream().filter(o -> o instanceof Publication).count());
|
||||
assertEquals(4, list.stream().filter(o -> o instanceof Relation).count());
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
final Relation r1 = (Relation) list.get(1);
|
||||
final Relation r2 = (Relation) list.get(2);
|
||||
Publication p = (Publication) list.stream().filter(o -> o instanceof Publication).findFirst().get();
|
||||
|
||||
assertValidId(p.getId());
|
||||
|
||||
|
@ -125,26 +121,62 @@ class MappersTest {
|
|||
|
||||
assertNotNull(p.getBestaccessright());
|
||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||
assertValidId(r1.getSource());
|
||||
assertValidId(r1.getTarget());
|
||||
assertValidId(r2.getSource());
|
||||
assertValidId(r2.getTarget());
|
||||
assertValidId(r1.getCollectedfrom().get(0).getKey());
|
||||
assertValidId(r2.getCollectedfrom().get(0).getKey());
|
||||
assertNotNull(r1.getDataInfo());
|
||||
assertNotNull(r2.getDataInfo());
|
||||
assertNotNull(r1.getDataInfo().getTrust());
|
||||
assertNotNull(r2.getDataInfo().getTrust());
|
||||
assertEquals(r1.getSource(), r2.getTarget());
|
||||
assertEquals(r2.getSource(), r1.getTarget());
|
||||
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
|
||||
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||
assertTrue(r1.getValidated());
|
||||
assertTrue(r2.getValidated());
|
||||
assertEquals("2020-01-01", r1.getValidationDate());
|
||||
assertEquals("2020-01-01", r2.getValidationDate());
|
||||
|
||||
assertNotNull(p.getFulltext());
|
||||
assertEquals(1, p.getFulltext().size());
|
||||
assertEquals("https://oneecosystem.pensoft.net/article/13718/", p.getFulltext().get(0).getValue());
|
||||
|
||||
// RESULT PROJECT
|
||||
List<Relation> resultProject = list
|
||||
.stream()
|
||||
.filter(o -> o instanceof Relation)
|
||||
.map(o -> (Relation) o)
|
||||
.filter(r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertEquals(2, resultProject.size());
|
||||
final Relation rp1 = resultProject.get(0);
|
||||
final Relation rp2 = resultProject.get(1);
|
||||
|
||||
verifyRelation(rp1);
|
||||
verifyRelation(rp2);
|
||||
|
||||
assertTrue(rp1.getValidated());
|
||||
assertTrue(rp2.getValidated());
|
||||
assertEquals("2020-01-01", rp1.getValidationDate());
|
||||
assertEquals("2020-01-01", rp2.getValidationDate());
|
||||
|
||||
assertEquals(rp1.getSource(), rp2.getTarget());
|
||||
assertEquals(rp2.getSource(), rp1.getTarget());
|
||||
|
||||
// AFFILIATIONS
|
||||
List<Relation> affiliation = list
|
||||
.stream()
|
||||
.filter(o -> o instanceof Relation)
|
||||
.map(o -> (Relation) o)
|
||||
.filter(r -> ModelConstants.RESULT_ORGANIZATION.equals(r.getRelType()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertEquals(2, affiliation.size());
|
||||
final Relation aff1 = affiliation.get(0);
|
||||
final Relation aff2 = affiliation.get(1);
|
||||
|
||||
verifyRelation(aff1);
|
||||
verifyRelation(aff2);
|
||||
|
||||
assertEquals(aff1.getSource(), aff2.getTarget());
|
||||
assertEquals(aff2.getSource(), aff1.getTarget());
|
||||
}
|
||||
|
||||
private void verifyRelation(Relation r) {
|
||||
assertValidId(r.getSource());
|
||||
assertValidId(r.getTarget());
|
||||
assertValidId(r.getCollectedfrom().get(0).getKey());
|
||||
assertNotNull(r.getDataInfo());
|
||||
assertNotNull(r.getDataInfo().getTrust());
|
||||
assertTrue(StringUtils.isNotBlank(r.getRelClass()));
|
||||
assertTrue(StringUtils.isNotBlank(r.getRelType()));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -734,6 +766,51 @@ class MappersTest {
|
|||
assertFalse(p_cleaned.getTitle().isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testZenodo() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
System.out.println("***************");
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
assertValidId(p.getId());
|
||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
|
||||
assertNotNull(p.getTitle());
|
||||
assertFalse(p.getTitle().isEmpty());
|
||||
assertEquals(1, p.getTitle().size());
|
||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
|
||||
assertNotNull(p.getAuthor());
|
||||
assertEquals(2, p.getAuthor().size());
|
||||
|
||||
Author author = p
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8007")))
|
||||
.findFirst()
|
||||
.get();
|
||||
assertNotNull(author);
|
||||
assertTrue(StringUtils.isBlank(author.getSurname()));
|
||||
assertTrue(StringUtils.isBlank(author.getName()));
|
||||
assertEquals("Anne van Weerden", author.getFullname());
|
||||
|
||||
author = p
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8008")))
|
||||
.findFirst()
|
||||
.get();
|
||||
assertNotNull(author);
|
||||
assertFalse(StringUtils.isBlank(author.getSurname()));
|
||||
assertFalse(StringUtils.isBlank(author.getName()));
|
||||
assertFalse(StringUtils.isBlank(author.getFullname()));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testOdfFromHdfs() throws IOException, DocumentException {
|
||||
final String xml = IOUtils
|
||||
|
@ -835,6 +912,20 @@ class MappersTest {
|
|||
assertEquals("EUR", p.getProcessingchargecurrency().getValue());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testROHub() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
System.out.println("***************");
|
||||
// final Dataset p = (Dataset) list.get(0);
|
||||
// assertValidId(p.getId());
|
||||
// assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
// System.out.println(p.getTitle().get(0).getValue());
|
||||
// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
}
|
||||
|
||||
private void assertValidId(final String id) {
|
||||
// System.out.println(id);
|
||||
|
||||
|
|
|
@ -12,8 +12,11 @@ import java.sql.Array;
|
|||
import java.sql.Date;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -28,12 +31,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
|
@ -63,22 +61,32 @@ public class MigrateDbEntitiesApplicationTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testProcessDatasource() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
|
||||
public void testProcessService() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("services_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processDatasource(rs);
|
||||
final List<Oaf> list = app.processService(rs);
|
||||
assertEquals(1, list.size());
|
||||
verifyMocks(fields);
|
||||
|
||||
final Datasource ds = (Datasource) list.get(0);
|
||||
assertValidId(ds.getId());
|
||||
assertValidId(ds.getCollectedfrom().get(0).getKey());
|
||||
ds
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.forEach(dsId -> assertValidId(dsId));
|
||||
|
||||
assertEquals(1, ds.getPid().size());
|
||||
assertEquals("r3d100010218", ds.getPid().get(0).getValue());
|
||||
assertEquals("re3data", ds.getPid().get(0).getQualifier().getClassid());
|
||||
assertEquals("dnet:pid_types", ds.getPid().get(0).getQualifier().getSchemeid());
|
||||
|
||||
assertEquals(getValueAsString("officialname", fields), ds.getOfficialname().getValue());
|
||||
assertEquals(getValueAsString("englishname", fields), ds.getEnglishname().getValue());
|
||||
assertEquals(getValueAsString("contactemail", fields), ds.getContactemail().getValue());
|
||||
assertEquals(getValueAsString("websiteurl", fields), ds.getWebsiteurl().getValue());
|
||||
assertEquals(getValueAsString("logourl", fields), ds.getLogourl());
|
||||
assertEquals(getValueAsString("contactemail", fields), ds.getContactemail().getValue());
|
||||
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
|
||||
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
|
||||
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
|
||||
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
|
||||
|
@ -90,19 +98,98 @@ public class MigrateDbEntitiesApplicationTest {
|
|||
assertEquals("pubsrepository::journal", ds.getDatasourcetypeui().getClassid());
|
||||
assertEquals("dnet:datasource_typologies_ui", ds.getDatasourcetypeui().getSchemeid());
|
||||
|
||||
assertEquals("Data Source", ds.getEosctype().getClassid());
|
||||
assertEquals("Data Source", ds.getEosctype().getClassname());
|
||||
assertEquals("dnet:eosc_types", ds.getEosctype().getSchemeid());
|
||||
assertEquals("dnet:eosc_types", ds.getEosctype().getSchemename());
|
||||
|
||||
assertEquals("Journal archive", ds.getEoscdatasourcetype().getClassid());
|
||||
assertEquals("Journal archive", ds.getEoscdatasourcetype().getClassname());
|
||||
assertEquals("dnet:eosc_datasource_types", ds.getEoscdatasourcetype().getSchemeid());
|
||||
assertEquals("dnet:eosc_datasource_types", ds.getEoscdatasourcetype().getSchemename());
|
||||
|
||||
assertEquals("openaire4.0", ds.getOpenairecompatibility().getClassid());
|
||||
assertEquals("openaire4.0", ds.getOpenairecompatibility().getClassname());
|
||||
assertEquals("dnet:datasourceCompatibilityLevel", ds.getOpenairecompatibility().getSchemeid());
|
||||
assertEquals("dnet:datasourceCompatibilityLevel", ds.getOpenairecompatibility().getSchemename());
|
||||
|
||||
assertEquals(getValueAsDouble("latitude", fields).toString(), ds.getLatitude().getValue());
|
||||
assertEquals(getValueAsDouble("longitude", fields).toString(), ds.getLongitude().getValue());
|
||||
assertEquals(getValueAsString("dateofvalidation", fields), ds.getDateofvalidation());
|
||||
|
||||
assertEquals(getValueAsString("description", fields), ds.getDescription().getValue());
|
||||
|
||||
// TODO assertEquals(getValueAsString("subjects", fields), ds.getSubjects());
|
||||
|
||||
assertEquals("0.0", ds.getOdnumberofitems().getValue());
|
||||
assertEquals(getValueAsString("odnumberofitemsdate", fields), ds.getOdnumberofitemsdate());
|
||||
assertEquals(getValueAsString("odpolicies", fields), ds.getOdpolicies());
|
||||
|
||||
assertEquals(
|
||||
getValueAsList("odlanguages", fields),
|
||||
ds.getOdlanguages().stream().map(Field::getValue).collect(Collectors.toList()));
|
||||
assertEquals(getValueAsList("languages", fields), ds.getLanguages());
|
||||
assertEquals(
|
||||
getValueAsList("accessinfopackage", fields),
|
||||
ds.getAccessinfopackage().stream().map(Field::getValue).collect(Collectors.toList()));
|
||||
assertEquals(getValueAsString("releasestartdate", fields), ds.getReleasestartdate());
|
||||
assertEquals(getValueAsString("releaseenddate", fields), ds.getReleaseenddate());
|
||||
assertEquals(getValueAsString("missionstatementurl", fields), ds.getMissionstatementurl());
|
||||
|
||||
assertEquals(null, ds.getDataprovider());
|
||||
assertEquals(null, ds.getServiceprovider());
|
||||
|
||||
assertEquals(getValueAsString("databaseaccesstype", fields), ds.getDatabaseaccesstype());
|
||||
assertEquals(getValueAsString("datauploadtype", fields), ds.getDatauploadtype());
|
||||
assertEquals(getValueAsString("databaseaccessrestriction", fields), ds.getDatabaseaccessrestriction());
|
||||
assertEquals(getValueAsString("datauploadrestriction", fields), ds.getDatauploadrestriction());
|
||||
|
||||
assertEquals(false, ds.getVersioning().getValue());
|
||||
assertEquals(false, ds.getVersioncontrol());
|
||||
|
||||
assertEquals(getValueAsString("citationguidelineurl", fields), ds.getCitationguidelineurl());
|
||||
assertEquals(getValueAsString("pidsystems", fields), ds.getPidsystems());
|
||||
assertEquals(getValueAsString("certificates", fields), ds.getCertificates());
|
||||
|
||||
assertEquals(getValueAsList("researchentitytypes", fields), ds.getResearchentitytypes());
|
||||
|
||||
assertEquals("National", ds.getJurisdiction().getClassid());
|
||||
assertEquals("eosc:jurisdictions", ds.getJurisdiction().getSchemeid());
|
||||
|
||||
assertTrue(ds.getThematic());
|
||||
assertTrue(ds.getKnowledgegraph());
|
||||
|
||||
assertEquals(1, ds.getContentpolicies().size());
|
||||
assertEquals("Journal article", ds.getContentpolicies().get(0).getClassid());
|
||||
assertEquals("eosc:contentpolicies", ds.getContentpolicies().get(0).getSchemeid());
|
||||
HashSet<String> cpSchemeId = ds
|
||||
.getContentpolicies()
|
||||
.stream()
|
||||
.map(Qualifier::getSchemeid)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
assertTrue(cpSchemeId.size() == 1);
|
||||
assertTrue(cpSchemeId.contains("eosc:contentpolicies"));
|
||||
HashSet<String> cpSchemeName = ds
|
||||
.getContentpolicies()
|
||||
.stream()
|
||||
.map(Qualifier::getSchemename)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
assertTrue(cpSchemeName.size() == 1);
|
||||
assertTrue(cpSchemeName.contains("eosc:contentpolicies"));
|
||||
assertEquals(2, ds.getContentpolicies().size());
|
||||
assertEquals("Taxonomic classification", ds.getContentpolicies().get(0).getClassid());
|
||||
assertEquals("Resource collection", ds.getContentpolicies().get(1).getClassid());
|
||||
|
||||
assertEquals(getValueAsString("submissionpolicyurl", fields), ds.getSubmissionpolicyurl());
|
||||
assertEquals(getValueAsString("preservationpolicyurl", fields), ds.getPreservationpolicyurl());
|
||||
|
||||
assertEquals(
|
||||
getValueAsList("researchproductaccesspolicies", fields),
|
||||
ds.getResearchproductaccesspolicies());
|
||||
assertEquals(
|
||||
getValueAsList("researchproductmetadataaccesspolicies", fields),
|
||||
ds.getResearchproductmetadataaccesspolicies());
|
||||
|
||||
assertEquals(true, ds.getConsenttermsofuse());
|
||||
assertEquals(true, ds.getFulltextdownload());
|
||||
assertEquals("2022-03-11", ds.getConsenttermsofusedate());
|
||||
assertEquals("2022-03-11", ds.getLastconsenttermsofusedate());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -154,7 +241,7 @@ public class MigrateDbEntitiesApplicationTest {
|
|||
public void testProcessDatasourceOrganization() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processDatasourceOrganization(rs);
|
||||
final List<Oaf> list = app.processServiceOrganization(rs);
|
||||
|
||||
assertEquals(2, list.size());
|
||||
verifyMocks(fields);
|
||||
|
@ -356,18 +443,31 @@ public class MigrateDbEntitiesApplicationTest {
|
|||
}
|
||||
|
||||
private Float getValueAsFloat(final String name, final List<TypedField> fields) {
|
||||
return new Float(getValueAs(name, fields).toString());
|
||||
final Object value = getValueAs(name, fields);
|
||||
return value != null ? new Float(value.toString()) : null;
|
||||
}
|
||||
|
||||
private Double getValueAsDouble(final String name, final List<TypedField> fields) {
|
||||
final Object value = getValueAs(name, fields);
|
||||
return value != null ? new Double(value.toString()) : null;
|
||||
}
|
||||
|
||||
private Integer getValueAsInt(final String name, final List<TypedField> fields) {
|
||||
final Object value = getValueAs(name, fields);
|
||||
return value != null ? new Integer(value.toString()) : null;
|
||||
}
|
||||
|
||||
private <T> T getValueAs(final String name, final List<TypedField> fields) {
|
||||
return fields
|
||||
final Optional<T> field = fields
|
||||
.stream()
|
||||
.filter(f -> f.getField().equals(name))
|
||||
.map(TypedField::getValue)
|
||||
.filter(Objects::nonNull)
|
||||
.map(o -> (T) o)
|
||||
.findFirst()
|
||||
.get();
|
||||
.map(TypedField::getValue)
|
||||
.map(o -> (T) o);
|
||||
if (!field.isPresent()) {
|
||||
return null;
|
||||
}
|
||||
return field.get();
|
||||
}
|
||||
|
||||
private List<String> getValueAsList(final String name, final List<TypedField> fields) {
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.pangaea
|
||||
|
||||
import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import java.util.TimeZone
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.Date
|
||||
import scala.io.Source
|
||||
class PangaeaTransformTest {

  /** Prints a timestamp built from the current date, then the result of parsing the
    * `input.xml` test resource with `PangaeaUtils.parseXml`. No assertion is made on
    * either value.
    */
  @Test
  def test_dateStamp(): Unit = {
    val now = new Date()
    val stamp: String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS").format(now)}Z"
    println(stamp)

    // close the source once the resource has been read, instead of leaking the underlying stream
    val source = Source.fromInputStream(getClass.getResourceAsStream("input.xml"))
    val xml =
      try source.mkString
      finally source.close()
    println(PangaeaUtils.parseXml(xml))
  }

}
|
File diff suppressed because one or more lines are too long
|
@ -497,6 +497,7 @@ dnet:publication_resource @=@ 0044 @=@ Graduate diploma
|
|||
dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma
|
||||
dnet:publication_resource @=@ 0000 @=@ UNKNOWN
|
||||
dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance
|
||||
dnet:publication_resource @=@ 0048 @=@ RO-crate
|
||||
dnet:languages @=@ abk @=@ ab
|
||||
dnet:languages @=@ aar @=@ aa
|
||||
dnet:languages @=@ afr @=@ af
|
||||
|
|
|
@ -164,6 +164,7 @@ dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound
|
|||
dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis
|
||||
dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown
|
||||
dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance
|
||||
dnet:publication_resource @=@ dnet:publication_resource @=@ 0048 @=@ Research Object
|
||||
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram
|
||||
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program
|
||||
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1 +1,5 @@
|
|||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" }}
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" },
|
||||
"originalId": ["eurocrisdris::1234"],
|
||||
"collectedfrom": [{"key": "eurocrisdris::2b29d08e383ff4cd8a2b6b226ce37e38", "value": "Directory of Research Information System (DRIS)"}],
|
||||
"pid": [{"value": "10.1010.xyx", "qualifier": {"classid": "doi"}}]
|
||||
}
|
|
@ -1 +1,4 @@
|
|||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" }}
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" },
|
||||
"originalId": ["opendoar____::1234"],
|
||||
"collectedfrom": [{"key": "openaire____::47ce9e9f4fad46e732cff06419ecaabb", "value": "OpenDOAR"}]
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
[
|
||||
{
|
||||
"field": "datasource",
|
||||
"field": "service",
|
||||
"type": "string",
|
||||
"value": "openaire____::revistasunicauca"
|
||||
},
|
||||
|
|
|
@ -60,6 +60,15 @@
|
|||
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
|
||||
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
|
||||
<oaf:refereed>0001</oaf:refereed>
|
||||
<oaf:relation relClass="hasAuthorInstitution"
|
||||
relType="resultOrganization"
|
||||
subRelType="affiliation"
|
||||
targetType="organization">ror_________::https://ror.org/02gdcn153</oaf:relation>
|
||||
<oaf:relation relClass="isProducedBy"
|
||||
relType="resultProject"
|
||||
subRelType="outcome"
|
||||
targetType="project"
|
||||
validationDate="2020-01-01">corda_______::226852</oaf:relation>
|
||||
</metadata>
|
||||
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-3"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri">>
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<identifier>oai:zenodo.org:3406824</identifier>
|
||||
<datestamp>2020-01-20T16:45:20Z</datestamp>
|
||||
<setSpec>openaire</setSpec>
|
||||
<dr:dateOfTransformation>2022-06-07T10:21:24.06Z</dr:dateOfTransformation>
|
||||
<dri:objIdentifier>test________::92fe3efa47883b2f3401e6a4bd92e9d7</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2020-05-21T05:26:15.93Z</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>2020-08-01T11:06:26.977Z</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<resource xmlns="http://datacite.org/schema/kernel-4"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
|
||||
<identifier identifierType="DOI">10.5281/zenodo.3406824</identifier>
|
||||
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
|
||||
<alternateIdentifier alternateIdentifierType="URL">http://dx.doi.org/10.5281/zenodo.3406824</alternateIdentifier>
|
||||
</alternateIdentifiers>
|
||||
<creators>
|
||||
<creator>
|
||||
<creatorName>Anne van Weerden</creatorName>
|
||||
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8007</nameIdentifier>
|
||||
<affiliation>Utrecht University Library</affiliation>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Anne van, Weerden</creatorName>
|
||||
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8008</nameIdentifier>
|
||||
<affiliation>Utrecht University Library</affiliation>
|
||||
</creator>
|
||||
</creators>
|
||||
<titles>
|
||||
<title>Helen Bayly and Catherine Disney as influences in the life of Sir William Rowan Hamilton</title>
|
||||
</titles>
|
||||
<publisher>Zenodo</publisher>
|
||||
<publicationYear>2018</publicationYear>
|
||||
<subjects>
|
||||
<subject>Sir William Rowan Hamilton, Lady Helena Maria Hamilton Bayly, Catherine Disney, Ireland, history, biography, nineteenth century</subject>
|
||||
</subjects>
|
||||
<dates>
|
||||
<date dateType="Issued">2018-12-28</date>
|
||||
</dates>
|
||||
<language>en</language>
|
||||
<resourceType resourceTypeGeneral="JournalArticle"/>
|
||||
<relatedIdentifiers>
|
||||
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.3406823</relatedIdentifier>
|
||||
</relatedIdentifiers>
|
||||
<rightsList>
|
||||
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
|
||||
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
|
||||
</rightsList>
|
||||
<descriptions>
|
||||
<description descriptionType="Abstract"><p>In the 1880s Robert Graves published a biography about Sir William Rowan Hamilton (1805-1865), to which in a 1980 biography Thomas Hankins added further information. From these biographies a picture emerged of a man who was unhappily married because he had lost the love of his life, which raised the question how such an unhappy man could produce so much beautiful mathematics. In this article it is stated that a main cause for the unhappy picture is that Graves ignored the influence on one another of Hamilton and his wife Helen Bayly, and Hankins that of Hamilton and his first and lost love Catherine Disney. It is then shown that if these influences are taken into account a very different view on Hamilton;s private life arises, in which he was happily married to a wife who enabled him to work as he needed to.</p></description>
|
||||
</descriptions>
|
||||
</resource>
|
||||
<oaf:identifier identifierType="doi">10.5281/zenodo.3406824</oaf:identifier>
|
||||
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2018-12-28</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>https://creativecommons.org/licenses/by/4.0/legalcode</oaf:license>
|
||||
<oaf:language>eng</oaf:language>
|
||||
<oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
|
||||
<oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
|
||||
</metadata>
|
||||
</record>
|
|
@ -0,0 +1,103 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<dri:objIdentifier>eosca5322f5f::4dd1aaf93ae136b65dc9ee4e6f76eac9</dri:objIdentifier>
|
||||
<dri:recordIdentifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2022-05-25T15:35:48.262Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>eosca5322f5f</oaf:datasourceprefix>
|
||||
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
|
||||
<datestamp>2022-05-25T15:35:38Z</datestamp>
|
||||
<setSpec>rohub_data</setSpec>
|
||||
<setSpec>ro-crate_data</setSpec>
|
||||
<dr:dateOfTransformation>2022-05-25T15:36:11.094Z</dr:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaire:resource xmlns="http://namespace.openaire.eu/schema/oaire/">
|
||||
<datacite:identifier identifierType="landingPage">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</datacite:identifier>
|
||||
<datacite:alternateIdentifiers>
|
||||
<datacite:alternateIdentifier alternateIdentifierType="URL">http://api.rohub.org/api/ros/53aa90bf-c593-4e6d-923f-d4711ac4b0e1/</datacite:alternateIdentifier>
|
||||
</datacite:alternateIdentifiers>
|
||||
<datacite:relatedIdentifiers>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="" relationType="">
|
||||
https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb
|
||||
</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="" relationType="">
|
||||
https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html
|
||||
</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html</datacite:relatedIdentifier>
|
||||
</datacite:relatedIdentifiers>
|
||||
<creators xmlns="http://datacite.org/schema/kernel-4">
|
||||
<creator>
|
||||
<creator>
|
||||
<creatorName>Anne Fouilloux</creatorName>
|
||||
</creator>
|
||||
</creator>
|
||||
</creators>
|
||||
<dates xmlns="http://datacite.org/schema/kernel-4">
|
||||
<date dateType="Created">2021-12-19T21:18:33Z</date>
|
||||
</dates>
|
||||
<dc:descriptions>
|
||||
<dc:description descriptionType="Abstract">The COVID-19 pandemic has led to significant reductions in economic activity, especially during lockdowns. Several studies has shown that the concentration of nitrogen dioxyde and particulate matter levels have reduced during lockdown events. Reductions in transportation sector emissions are most likely largely responsible for the NO2 anomalies. In this study, we analyze the impact of lockdown events on the air quality using data from Copernicus Atmosphere Monitoring Service over Europe and at selected locations.</dc:description>
|
||||
</dc:descriptions>
|
||||
<oaire:fundingReferences>
|
||||
<oaire:fundingReference>
|
||||
<oaire:funderName>European Commission</oaire:funderName>
|
||||
<oaire:funderIdentifier funderIdentifierType="Crossref Funder ID">10.13039/501100000781</oaire:funderIdentifier>
|
||||
<oaire:awardNumber awardURI="">101017502</oaire:awardNumber>
|
||||
<oaire:awardTitle>Research Lifecycle Management for Earth Science Communities and Copernicus Users</oaire:awardTitle>
|
||||
</oaire:fundingReference>
|
||||
</oaire:fundingReferences>
|
||||
<oaire:licenseCondition uri="https://opensource.org/licenses/MIT">MIT License</oaire:licenseCondition>
|
||||
<dc:publisher>University of Oslo</dc:publisher>
|
||||
<dc:publicationYear>2021</dc:publicationYear>
|
||||
<oaire:resourceType resourceTypeGeneral="other research product" uri="http://purl.org/coar/resource_type/c_1843">RO-crate</oaire:resourceType>
|
||||
<rightsList xmlns="http://datacite.org/schema/kernel-4">
|
||||
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
|
||||
</rightsList>
|
||||
<sizes xmlns="http://datacite.org/schema/kernel-4">
|
||||
<size>11.971 MB</size>
|
||||
</sizes>
|
||||
<subjects xmlns="http://datacite.org/schema/kernel-4">
|
||||
<subject>Applied sciences</subject>
|
||||
<subject>Meteorology</subject>
|
||||
<subject>EOSC::RO-crate</subject>
|
||||
</subjects>
|
||||
<titles xmlns="http://datacite.org/schema/kernel-4">
|
||||
<title>Impact of the Covid-19 Lockdown on Air quality over Europe</title>
|
||||
</titles>
|
||||
</oaire:resource>
|
||||
<oaf:identifier identifierType="URL">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</oaf:identifier>
|
||||
<dr:CobjCategory type="other">0048</dr:CobjCategory>
|
||||
<oaf:dateAccepted/>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>https://opensource.org/licenses/MIT</oaf:license>
|
||||
<oaf:language>und</oaf:language>
|
||||
<oaf:hostedBy id="eosc________::psnc::psnc.rohub" name="ROHub"/>
|
||||
<oaf:collectedFrom id="eosc________::psnc::psnc.rohub" name="ROHub"/>
|
||||
</metadata>
|
||||
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2022-05-25T15:35:48.262Z">
|
||||
<baseURL>https%3A%2F%2Fapi.rohub.org%2Fapi%2Foai2d%2F</baseURL>
|
||||
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
|
||||
<datestamp>2022-05-25T15:35:38Z</datestamp>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk"
|
||||
classname="Harvested" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
|
@ -1,18 +1,51 @@
|
|||
[
|
||||
{
|
||||
"field": "datasourceid",
|
||||
"field": "id",
|
||||
"type": "string",
|
||||
"value": "274269ac6f3b::2579-5449"
|
||||
},
|
||||
{
|
||||
"field": "identities",
|
||||
"field": "originalid",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"274269ac6f3b::2579-5449",
|
||||
"fairsharing_::1562",
|
||||
"piwik:13",
|
||||
null
|
||||
null,
|
||||
"re3data_____::r3d100010213"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "pid",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"r3d100010218###re3data@@@dnet:pid_types"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "datasourcetype",
|
||||
"type": "string",
|
||||
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
|
||||
},
|
||||
{
|
||||
"field": "datasourcetypeui",
|
||||
"type": "string",
|
||||
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
|
||||
},
|
||||
{
|
||||
"field": "eosctype",
|
||||
"type": "string",
|
||||
"value": "Data Source@@@dnet:eosc_types"
|
||||
},
|
||||
{
|
||||
"field": "eoscdatasourcetype",
|
||||
"type": "string",
|
||||
"value": "Journal archive@@@dnet:eosc_datasource_types"
|
||||
},
|
||||
{
|
||||
"field": "openairecompatibility",
|
||||
"type": "string",
|
||||
"value": "openaire4.0@@@dnet:datasourceCompatibilityLevel"
|
||||
},
|
||||
{
|
||||
"field": "officialname",
|
||||
"type": "string",
|
||||
|
@ -23,16 +56,6 @@
|
|||
"type": "string",
|
||||
"value": "Jurnal Ilmiah Pendidikan Scholastic"
|
||||
},
|
||||
{
|
||||
"field": "contactemail",
|
||||
"type": "string",
|
||||
"value": "test@test.it"
|
||||
},
|
||||
{
|
||||
"field": "openairecompatibility",
|
||||
"type": "string",
|
||||
"value": "hostedBy@@@dnet:datasourceCompatibilityLevel"
|
||||
},
|
||||
{
|
||||
"field": "websiteurl",
|
||||
"type": "string",
|
||||
|
@ -44,11 +67,14 @@
|
|||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "accessinfopackage",
|
||||
"type": "array",
|
||||
"value": [
|
||||
null
|
||||
]
|
||||
"field": "contactemail",
|
||||
"type": "string",
|
||||
"value": "test@test.it"
|
||||
},
|
||||
{
|
||||
"field": "namespaceprefix",
|
||||
"type": "string",
|
||||
"value": "ojs_25795449"
|
||||
},
|
||||
{
|
||||
"field": "latitude",
|
||||
|
@ -61,9 +87,19 @@
|
|||
"value": 0
|
||||
},
|
||||
{
|
||||
"field": "namespaceprefix",
|
||||
"field": "dateofvalidation",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "description",
|
||||
"type": "string",
|
||||
"value": "ojs_25795449"
|
||||
"value": "veterinary medicine"
|
||||
},
|
||||
{
|
||||
"field": "subjects",
|
||||
"type": "array",
|
||||
"value": []
|
||||
},
|
||||
{
|
||||
"field": "odnumberofitems",
|
||||
|
@ -75,16 +111,6 @@
|
|||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "subjects",
|
||||
"type": "array",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "description",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "odpolicies",
|
||||
"type": "string",
|
||||
|
@ -93,44 +119,33 @@
|
|||
{
|
||||
"field": "odlanguages",
|
||||
"type": "array",
|
||||
"value": []
|
||||
},
|
||||
{
|
||||
"field": "odcontenttypes",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"Journal articles"
|
||||
"English",
|
||||
"German",
|
||||
"French",
|
||||
"Danish",
|
||||
"Norwegian",
|
||||
"Swedish"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
"field": "languages",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"English",
|
||||
"German",
|
||||
"French",
|
||||
"Danish",
|
||||
"Norwegian",
|
||||
"Swedish"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "double",
|
||||
"value": 0.9
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "dateofcollection",
|
||||
"type": "date",
|
||||
"value": "2020-01-21"
|
||||
},
|
||||
{
|
||||
"field": "dateofvalidation",
|
||||
"type": "date",
|
||||
"value": null
|
||||
"field": "accessinfopackage",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "releasestartdate",
|
||||
|
@ -147,16 +162,6 @@
|
|||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "dataprovider",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "serviceprovider",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "databaseaccesstype",
|
||||
"type": "string",
|
||||
|
@ -183,12 +188,12 @@
|
|||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "citationguidelineurl",
|
||||
"type": "string",
|
||||
"field": "versioncontrol",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "qualitymanagementkind",
|
||||
"field": "citationguidelineurl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
|
@ -208,29 +213,38 @@
|
|||
"value": []
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": "Jurnal Fakultas Sastra Universitas Ekasakti"
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "datasourcetype",
|
||||
"type": "string",
|
||||
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
|
||||
"field": "trust",
|
||||
"type": "double",
|
||||
"value": 0.9
|
||||
},
|
||||
{
|
||||
"field": "datasourcetypeui",
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
|
||||
"field": "dateofcollection",
|
||||
"type": "date",
|
||||
"value": "2020-01-21"
|
||||
},
|
||||
{
|
||||
"field": "collectedfrom",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"openaire____::fairsharing@@@FAIRsharing.org",
|
||||
"openaire____::opendoar@@@OpenDOAR",
|
||||
"openaire____::re3data@@@Registry of Research Data Repository"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "issnPrinted",
|
||||
|
@ -247,6 +261,13 @@
|
|||
"type": "string",
|
||||
"value": "2579-5447"
|
||||
},
|
||||
{
|
||||
"field": "researchentitytypes",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"Research Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "jurisdiction",
|
||||
"type": "string",
|
||||
|
@ -257,16 +278,36 @@
|
|||
"type": "boolean",
|
||||
"value": true
|
||||
},
|
||||
{
|
||||
"field": "knowledgegraph",
|
||||
"type": "boolean",
|
||||
"value": true
|
||||
},
|
||||
{
|
||||
"field": "contentpolicies",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"Journal article@@@eosc:contentpolicies"
|
||||
"Taxonomic classification@@@eosc:contentpolicies",
|
||||
"Resource collection@@@eosc:contentpolicies"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "submissionpolicyurl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "preservationpolicyurl",
|
||||
"type": "string",
|
||||
"value": "Permanent Archiving https://datadryad.org/stash/faq"
|
||||
},
|
||||
{
|
||||
"field": "researchproductaccesspolicies",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"https://100percentit.com/legal/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "researchproductmetadataaccesspolicies",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"https://wenmr.science.uu.nl/conditions"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -283,5 +324,10 @@
|
|||
"field": "consenttermsofusedate",
|
||||
"type": "date",
|
||||
"value": "2022-03-11"
|
||||
},
|
||||
{
|
||||
"field": "lastconsenttermsofusedate",
|
||||
"type": "date",
|
||||
"value": "2022-03-11"
|
||||
}
|
||||
]
|
Binary file not shown.
|
@ -535,6 +535,12 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (ds.getDatasourcetypeui() != null) {
|
||||
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", ds.getDatasourcetypeui()));
|
||||
}
|
||||
if (ds.getEosctype() != null) {
|
||||
metadata.add(XmlSerializationUtils.mapQualifier("eosctype", ds.getEosctype()));
|
||||
}
|
||||
if (ds.getEoscdatasourcetype() != null) {
|
||||
metadata.add(XmlSerializationUtils.mapQualifier("eoscdatasourcetype", ds.getEoscdatasourcetype()));
|
||||
}
|
||||
if (ds.getOpenairecompatibility() != null) {
|
||||
metadata
|
||||
.add(
|
||||
|
@ -583,6 +589,16 @@ public class XmlRecordFactory implements Serializable {
|
|||
metadata
|
||||
.add(XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue()));
|
||||
}
|
||||
if (ds.getSubjects() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getOdnumberofitems() != null) {
|
||||
metadata
|
||||
.add(
|
||||
|
@ -609,6 +625,16 @@ public class XmlRecordFactory implements Serializable {
|
|||
.map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getLanguages() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getLanguages()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(c -> XmlSerializationUtils.asXmlElement("languages", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getOdcontenttypes() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
|
@ -689,18 +715,18 @@ public class XmlRecordFactory implements Serializable {
|
|||
XmlSerializationUtils
|
||||
.asXmlElement("versioning", ds.getVersioning().getValue().toString()));
|
||||
}
|
||||
if (ds.getVersioncontrol() != null) {
|
||||
metadata
|
||||
.add(
|
||||
XmlSerializationUtils
|
||||
.asXmlElement("versioncontrol", ds.getVersioncontrol().toString()));
|
||||
}
|
||||
if (ds.getCitationguidelineurl() != null) {
|
||||
metadata
|
||||
.add(
|
||||
XmlSerializationUtils
|
||||
.asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue()));
|
||||
}
|
||||
if (ds.getQualitymanagementkind() != null) {
|
||||
metadata
|
||||
.add(
|
||||
XmlSerializationUtils
|
||||
.asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue()));
|
||||
}
|
||||
if (ds.getPidsystems() != null) {
|
||||
metadata
|
||||
.add(XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue()));
|
||||
|
@ -722,17 +748,24 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (ds.getJournal() != null) {
|
||||
metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal()));
|
||||
}
|
||||
if (ds.getSubjects() != null) {
|
||||
if (ds.getResearchentitytypes() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getSubjects()
|
||||
.getResearchentitytypes()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
|
||||
.map(c -> XmlSerializationUtils.asXmlElement("researchentitytypes", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getProvidedproducttypes() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getProvidedproducttypes()
|
||||
.stream()
|
||||
.map(c -> XmlSerializationUtils.asXmlElement("providedproducttypes", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (ds.getJurisdiction() != null) {
|
||||
metadata.add(XmlSerializationUtils.mapQualifier("jurisdiction", ds.getJurisdiction()));
|
||||
}
|
||||
|
@ -741,11 +774,6 @@ public class XmlRecordFactory implements Serializable {
|
|||
metadata.add(XmlSerializationUtils.asXmlElement("thematic", ds.getThematic().toString()));
|
||||
}
|
||||
|
||||
if (ds.getKnowledgegraph() != null) {
|
||||
metadata
|
||||
.add(XmlSerializationUtils.asXmlElement("knowledgegraph", ds.getKnowledgegraph().toString()));
|
||||
}
|
||||
|
||||
if (ds.getContentpolicies() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
|
@ -756,7 +784,34 @@ public class XmlRecordFactory implements Serializable {
|
|||
.map(q -> XmlSerializationUtils.mapQualifier("contentpolicy", q))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (ds.getSubmissionpolicyurl() != null) {
|
||||
metadata
|
||||
.add(XmlSerializationUtils.asXmlElement("submissionpolicyurl", ds.getSubmissionpolicyurl()));
|
||||
}
|
||||
if (ds.getPreservationpolicyurl() != null) {
|
||||
metadata
|
||||
.add(
|
||||
XmlSerializationUtils.asXmlElement("preservationpolicyurl", ds.getPreservationpolicyurl()));
|
||||
}
|
||||
if (ds.getResearchproductaccesspolicies() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getResearchproductaccesspolicies()
|
||||
.stream()
|
||||
.map(c -> XmlSerializationUtils.asXmlElement("researchproductaccesspolicies", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getResearchproductmetadataaccesspolicies() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
ds
|
||||
.getResearchproductmetadataaccesspolicies()
|
||||
.stream()
|
||||
.map(
|
||||
c -> XmlSerializationUtils.asXmlElement("researchproductmetadataaccesspolicies", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
case organization:
|
||||
final Organization o = (Organization) entity;
|
||||
|
@ -944,17 +999,11 @@ public class XmlRecordFactory implements Serializable {
|
|||
private List<String> measuresAsXml(List<Measure> measures) {
|
||||
return measures
|
||||
.stream()
|
||||
.flatMap(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.map(
|
||||
u -> Lists
|
||||
.newArrayList(
|
||||
new Tuple2<>("id", m.getId()),
|
||||
new Tuple2<>("key", u.getKey()),
|
||||
new Tuple2<>("value", u.getValue())))
|
||||
.map(l -> XmlSerializationUtils.asXmlElement("measure", l)))
|
||||
.map(m -> {
|
||||
List<Tuple2<String, String>> l = Lists.newArrayList(new Tuple2<>("id", m.getId()));
|
||||
m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue())));
|
||||
return XmlSerializationUtils.asXmlElement("measure", l);
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -41,7 +41,8 @@
|
|||
}
|
||||
},
|
||||
"publicationDate": {
|
||||
"type": "keyword"
|
||||
"type": "date",
|
||||
"format": "yyyy-MM-dd"
|
||||
},
|
||||
"relationship": {
|
||||
"properties": {
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.util.List;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
|
@ -72,11 +73,20 @@ public class XmlRecordFactoryTest {
|
|||
assertEquals("EUR", doc.valueOf("//processingchargecurrency/text()"));
|
||||
|
||||
assertEquals(
|
||||
"1.00889953098e-08", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'influence']/@value"));
|
||||
"5.06690394631e-09", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'influence']/@score"));
|
||||
assertEquals(
|
||||
"30.6576853333", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity_alt']/@value"));
|
||||
"C", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'influence']/@class"));
|
||||
|
||||
assertEquals(
|
||||
"4.62970429725e-08", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity']/@value"));
|
||||
"0.0", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity_alt']/@score"));
|
||||
assertEquals(
|
||||
"C", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity_alt']/@class"));
|
||||
|
||||
assertEquals(
|
||||
"3.11855618382e-09", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity']/@score"));
|
||||
assertEquals(
|
||||
"C", doc.valueOf("//*[local-name() = 'result']/measure[./@id = 'popularity']/@class"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -142,7 +152,7 @@ public class XmlRecordFactoryTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testDatasource() throws IOException, DocumentException {
|
||||
public void testService() throws IOException, DocumentException {
|
||||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
|
@ -167,6 +177,14 @@ public class XmlRecordFactoryTest {
|
|||
assertEquals("true", doc.valueOf("//thematic"));
|
||||
assertEquals("Journal article", doc.valueOf("//contentpolicy/@classname"));
|
||||
assertEquals("Journal archive", doc.valueOf("//datasourcetypeui/@classname"));
|
||||
assertEquals("Data Source", doc.valueOf("//eosctype/@classname"));
|
||||
|
||||
final List pids = doc.selectNodes("//pid");
|
||||
assertEquals(1, pids.size());
|
||||
assertEquals("re3data", ((Element) pids.get(0)).attribute("classid").getValue());
|
||||
assertEquals(
|
||||
"Registry of research data repositories", ((Element) pids.get(0)).attribute("classname").getValue());
|
||||
assertEquals("dnet:pid_types", ((Element) pids.get(0)).attribute("schemeid").getValue());
|
||||
assertEquals("dnet:pid_types", ((Element) pids.get(0)).attribute("schemename").getValue());
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -5,7 +5,37 @@
|
|||
"unit": [
|
||||
{
|
||||
"key": "score",
|
||||
"value": "1.00889953098e-08"
|
||||
"value": "5.06690394631e-09",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "class",
|
||||
"value": "C",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -14,7 +44,37 @@
|
|||
"unit": [
|
||||
{
|
||||
"key": "score",
|
||||
"value": "30.6576853333"
|
||||
"value": "0.0",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "class",
|
||||
"value": "C",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -23,7 +83,115 @@
|
|||
"unit": [
|
||||
{
|
||||
"key": "score",
|
||||
"value": "4.62970429725e-08"
|
||||
"value": "3.11855618382e-09",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "class",
|
||||
"value": "C",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "influence_alt",
|
||||
"unit": [
|
||||
{
|
||||
"key": "score",
|
||||
"value": "0.0",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "class",
|
||||
"value": "C",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "impulse",
|
||||
"unit": [
|
||||
{
|
||||
"key": "score",
|
||||
"value": "0.0",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "class",
|
||||
"value": "C",
|
||||
"dataInfo": {
|
||||
"invisible": false,
|
||||
"inferred": true,
|
||||
"deletedbyinference": false,
|
||||
"trust": "",
|
||||
"inferenceprovenance": "update",
|
||||
"provenanceaction": {
|
||||
"classid": "measure:bip",
|
||||
"classname": "Inferred by OpenAIRE",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -81,3 +81,33 @@ where reltype='resultResult'
|
|||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
||||
|
||||
create table ${stats_db_name}.result_citations_oc stored as parquet as
|
||||
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
|
||||
and reltype='resultResult'
|
||||
and r1.resulttype.classname!=r2.resulttype.classname
|
||||
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
|
||||
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(target, 4);
|
||||
|
||||
create table ${stats_db_name}.result_references_oc stored as parquet as
|
||||
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
|
||||
and reltype='resultResult'
|
||||
and r1.resulttype.classname!=r2.resulttype.classname
|
||||
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
|
||||
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(source, 4);
|
|
@ -12,6 +12,8 @@ and (ri.accessright = 'Open Access'
|
|||
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
|
||||
on p.id= tmp.id;
|
||||
|
||||
compute stats indi_pub_green_oa;
|
||||
|
||||
create table indi_pub_grey_lit stored as parquet as
|
||||
select distinct p.id, coalesce(grey_lit, 0) as grey_lit
|
||||
from publication p
|
||||
|
@ -22,6 +24,8 @@ join result_classifications rt on rt.id = p.id
|
|||
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
|
||||
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_grey_lit;
|
||||
|
||||
create table indi_pub_doi_from_crossref stored as parquet as
|
||||
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
|
||||
from publication p
|
||||
|
@ -31,6 +35,7 @@ join datasource d on d.id = ri.collectedfrom
|
|||
where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
|
||||
on tmp.id=p.id;
|
||||
|
||||
compute stats indi_pub_doi_from_crossref;
|
||||
---- Sprint 2 ----
|
||||
create table indi_result_has_cc_licence stored as parquet as
|
||||
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
|
||||
|
@ -40,6 +45,8 @@ join result_licenses as license on license.id = r.id
|
|||
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
|
||||
on r.id= tmp.id;
|
||||
|
||||
compute stats indi_result_has_cc_licence;
|
||||
|
||||
create table indi_result_has_cc_licence_url stored as parquet as
|
||||
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
|
||||
from result r
|
||||
|
@ -49,16 +56,21 @@ join result_licenses as license on license.id = r.id
|
|||
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
|
||||
on r.id= tmp.id;
|
||||
|
||||
compute stats indi_result_has_cc_licence_url;
|
||||
|
||||
create table indi_pub_has_abstract stored as parquet as
|
||||
select distinct publication.id, coalesce(abstract, 1) has_abstract
|
||||
from publication;
|
||||
|
||||
compute stats indi_pub_has_abstract;
|
||||
|
||||
create table indi_result_with_orcid stored as parquet as
|
||||
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
from result r
|
||||
left outer join (select id, 1 as has_orcid from result_orcid) tmp
|
||||
on r.id= tmp.id;
|
||||
|
||||
compute stats indi_result_with_orcid;
|
||||
|
||||
---- Sprint 3 ----
|
||||
create table indi_funded_result_with_fundref stored as parquet as
|
||||
|
@ -68,27 +80,33 @@ left outer join (select distinct id, 1 as fundref from project_results
|
|||
where provenance='Harvested') tmp
|
||||
on r.id= tmp.id;
|
||||
|
||||
create table indi_result_org_country_collab stored as parquet as
|
||||
with tmp as
|
||||
(select o.id as id, o.country , ro.id as result,r.type from organization o
|
||||
join result_organization ro on o.id=ro.organization
|
||||
join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
||||
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.id<>o2.id and o1.country<>o2.country
|
||||
group by o1.id, o1.type,o2.country;
|
||||
compute stats indi_funded_result_with_fundref;
|
||||
|
||||
create table indi_result_org_collab stored as parquet as
|
||||
with tmp as
|
||||
(select o.id, ro.id as result,r.type from organization o
|
||||
join result_organization ro on o.id=ro.organization
|
||||
join result r on r.id=ro.id)
|
||||
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.id<>o2.id
|
||||
group by o1.id, o2.id, o1.type;
|
||||
-- create table indi_result_org_country_collab stored as parquet as
|
||||
-- with tmp as
|
||||
-- (select o.id as id, o.country , ro.id as result,r.type from organization o
|
||||
-- join result_organization ro on o.id=ro.organization
|
||||
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
||||
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
||||
-- from tmp as o1
|
||||
-- join tmp as o2 on o1.result=o2.result
|
||||
-- where o1.id<>o2.id and o1.country<>o2.country
|
||||
-- group by o1.id, o1.type,o2.country;
|
||||
--
|
||||
-- compute stats indi_result_org_country_collab;
|
||||
|
||||
-- create table indi_result_org_collab stored as parquet as
|
||||
-- with tmp as
|
||||
-- (select o.id, ro.id as result,r.type from organization o
|
||||
-- join result_organization ro on o.id=ro.organization
|
||||
-- join result r on r.id=ro.id)
|
||||
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
|
||||
-- from tmp as o1
|
||||
-- join tmp as o2 on o1.result=o2.result
|
||||
-- where o1.id<>o2.id
|
||||
-- group by o1.id, o2.id, o1.type;
|
||||
--
|
||||
-- compute stats indi_result_org_collab;
|
||||
|
||||
create table indi_funder_country_collab stored as parquet as
|
||||
with tmp as (select funder, project, country from organization_projects op
|
||||
|
@ -101,6 +119,8 @@ join tmp as f2 on f1.project=f2.project
|
|||
where f1.country<>f2.country
|
||||
group by f1.funder, f2.country, f1.country;
|
||||
|
||||
compute stats indi_funder_country_collab;
|
||||
|
||||
create table indi_result_country_collab stored as parquet as
|
||||
with tmp as
|
||||
(select country, ro.id as result,r.type from organization o
|
||||
|
@ -112,6 +132,8 @@ join tmp as o2 on o1.result=o2.result
|
|||
where o1.country<>o2.country
|
||||
group by o1.country, o2.country, o1.type;
|
||||
|
||||
compute stats indi_result_country_collab;
|
||||
|
||||
---- Sprint 4 ----
|
||||
create table indi_pub_diamond stored as parquet as
|
||||
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
|
||||
|
@ -123,6 +145,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
|
|||
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_diamond;
|
||||
|
||||
create table indi_pub_hybrid stored as parquet as
|
||||
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
|
||||
from publication_datasources pd
|
||||
|
@ -133,6 +157,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
|
|||
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_hybrid;
|
||||
|
||||
create table indi_pub_in_transformative stored as parquet as
|
||||
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
from publication pd
|
||||
|
@ -143,6 +169,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
|
|||
and ps.is_transformative_journal=true) tmp
|
||||
on pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_in_transformative;
|
||||
|
||||
create table indi_pub_closed_other_open stored as parquet as
|
||||
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
|
||||
left outer join
|
||||
|
@ -153,11 +181,12 @@ where d.type like '%Journal%' and ri.accessright='Closed Access' and
|
|||
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
|
||||
on tmp.id=ri.id;
|
||||
|
||||
|
||||
compute stats indi_pub_closed_other_open;
|
||||
---- Sprint 5 ----
|
||||
create table indi_result_no_of_copies stored as parquet as
|
||||
select id, count(id) as number_of_copies from result_instance group by id;
|
||||
|
||||
compute stats indi_result_no_of_copies;
|
||||
---- Sprint 6 ----
|
||||
create table indi_pub_gold_oa stored as parquet as
|
||||
WITH gold_oa AS (
|
||||
|
@ -183,6 +212,8 @@ LEFT OUTER JOIN (
|
|||
JOIN issn on issn.id=pd.datasource
|
||||
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_gold_oa;
|
||||
|
||||
create table indi_datasets_gold_oa stored as parquet as
|
||||
WITH gold_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
|
||||
|
@ -210,6 +241,8 @@ LEFT OUTER JOIN (
|
|||
JOIN issn on issn.id=pd.datasource
|
||||
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
|
||||
|
||||
compute stats indi_datasets_gold_oa;
|
||||
|
||||
create table indi_software_gold_oa stored as parquet as
|
||||
WITH gold_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
|
||||
|
@ -237,6 +270,8 @@ LEFT OUTER JOIN (
|
|||
JOIN issn on issn.id=pd.datasource
|
||||
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
|
||||
|
||||
compute stats indi_software_gold_oa;
|
||||
|
||||
create table indi_org_findable stored as parquet as
|
||||
with result_with_pid as (
|
||||
select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
|
||||
|
@ -263,6 +298,8 @@ join result_with_pid_share on result_with_pid_share.organization=allresults.orga
|
|||
left outer join (
|
||||
select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
|
||||
|
||||
compute stats indi_org_findable;
|
||||
|
||||
create table indi_org_openess stored as parquet as
|
||||
WITH datasets_oa as (
|
||||
SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
|
||||
|
@ -313,6 +350,8 @@ left outer join (
|
|||
left outer join (
|
||||
select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
|
||||
|
||||
compute stats indi_org_openess;
|
||||
|
||||
create table indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||
WITH hybrid_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||
|
@ -343,6 +382,8 @@ LEFT OUTER JOIN (
|
|||
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
|
||||
where cc.has_cc_license=1) tmp on pd.id=tmp.id;
|
||||
|
||||
compute stats indi_pub_hybrid_oa_with_cc;
|
||||
|
||||
create table indi_pub_downloads stored as parquet as
|
||||
SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats
|
||||
join publication on result_id=id
|
||||
|
@ -350,6 +391,8 @@ where downloads>0
|
|||
GROUP BY result_id
|
||||
order by no_dowloads desc;
|
||||
|
||||
compute stats indi_pub_downloads;
|
||||
|
||||
create table indi_pub_downloads_datasource stored as parquet as
|
||||
SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats
|
||||
join publication on result_id=id
|
||||
|
@ -357,15 +400,21 @@ where downloads>0
|
|||
GROUP BY result_id, repository_id
|
||||
order by result_id;
|
||||
|
||||
compute stats indi_pub_downloads_datasource;
|
||||
|
||||
create table indi_pub_downloads_year stored as parquet as
|
||||
SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us
|
||||
join publication on result_id=id where downloads>0
|
||||
GROUP BY result_id, `year`
|
||||
order by `year` asc;
|
||||
|
||||
compute stats indi_pub_downloads_year;
|
||||
|
||||
create table indi_pub_downloads_datasource_year stored as parquet as
|
||||
SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us
|
||||
join publication on result_id=id
|
||||
where downloads>0
|
||||
GROUP BY result_id, repository_id, `year`
|
||||
order by `year` asc, result_id;
|
||||
|
||||
compute stats indi_pub_downloads_datasource_year;
|
|
@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as
|
|||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||
union all
|
||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
'openorgs____::759d59f05d77188faee99b7493b46805',
|
||||
'openorgs____::b84450f9864182c67b8611b5593f4250',
|
||||
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
|
||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
|
||||
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2',
|
||||
'openorgs____::d169c7407dd417152596908d48c11460',
|
||||
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21',
|
||||
'openorgs____::2fb1e47b4612688d9de9169d579939a7',
|
||||
'openorgs____::759d59f05d77188faee99b7493b46805',
|
||||
'openorgs____::cad284878801b9465fa51a95b1d779db',
|
||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
|
||||
'openorgs____::c0286313e36479eff8676dba9b724b40'
|
||||
-- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot
|
||||
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki
|
||||
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII)
|
||||
) )) foo;
|
||||
compute stats TARGET.result;
|
||||
|
||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_citations;
|
||||
|
||||
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_references_oc;
|
||||
|
||||
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_citations_oc;
|
||||
|
||||
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_classifications;
|
||||
|
||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_apc;
|
||||
|
||||
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_concepts;
|
||||
|
||||
|
@ -90,11 +107,6 @@ compute stats TARGET.result_sources;
|
|||
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_topics;
|
||||
|
||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.result_apc;
|
||||
|
||||
|
||||
|
||||
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
||||
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
||||
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
||||
|
|
|
@ -127,6 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
|
|||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'resultOrganization'
|
||||
and r.target like '50|%'
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
|
||||
|
|
|
@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
|
|||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false;
|
||||
|
||||
-- datasource sources:
|
||||
-- where the datasource info have been collected from.
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -801,7 +801,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.10.32]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[2.12.0]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue