Merge branch 'beta' into monitoring

This commit is contained in:
Sandro La Bruzzo 2022-05-05 12:12:03 +02:00
commit e138b60d5c
32 changed files with 1987 additions and 319 deletions

View File

@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.sql.Array;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
@ -118,6 +120,17 @@ public class OafMapperUtils {
.collect(Collectors.toList());
}
/**
 * Converts a JDBC array into a list of its distinct, non-null elements.
 *
 * <p>Elements keep the order of their first occurrence in the array.
 *
 * @param values the SQL array to unwrap; may be {@code null}
 * @param <T> the element type the caller expects the array to hold
 * @return the distinct non-null elements, or {@code null} when {@code values} is {@code null}
 * @throws SQLException if the underlying array cannot be materialised
 */
public static <T> List<T> listValues(Array values) throws SQLException {
	if (values == null) {
		return null;
	}

	// The driver hands back an untyped object; the element type is whatever the caller asks for.
	@SuppressWarnings("unchecked")
	final T[] items = (T[]) values.getArray();

	final List<T> unique = new ArrayList<>();
	final Set<T> seen = new HashSet<>();
	for (final T item : items) {
		// Set.add returns false for an element that has already been collected.
		if (item != null && seen.add(item)) {
			unique.add(item);
		}
	}
	return unique;
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()

View File

@ -44,105 +44,104 @@ class OafMapperUtilsTest {
@Test
void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z "));
assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 "));
assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05"));
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05"));
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST"));
assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700"));
assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000"));
assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015"));
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013"));
assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71"));
assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000"));
assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00"));
assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC"));
assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001"));
assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26"));
assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04"));
assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014"));
assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71"));
assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03"));
assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30"));
assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601"));
assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333"));
}

View File

@ -284,4 +284,4 @@ object SparkGenerateDoiBoost {
.save(s"$workingDirPath/doiBoostOrganization")
}
}
}

View File

@ -280,10 +280,10 @@ case object Crossref2Oaf {
instance.setDateofacceptance(asField(createdDate.getValue))
}
val s: List[String] = List("https://doi.org/" + doi)
// val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct
// if (links.nonEmpty) {
// instance.setUrl(links.asJava)
// }
// val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct
// if (links.nonEmpty) {
// instance.setUrl(links.asJava)
// }
if (s.nonEmpty) {
instance.setUrl(s.asJava)
}
@ -584,12 +584,10 @@ case object Crossref2Oaf {
if (dp.length == 10) {
return GraphCleaningFunctions.cleanDate(dp)
}
}
else if (res.size ==2) {
} else if (res.size == 2) {
val dp = f"${res.head}-${res(1)}%02d-01"
return GraphCleaningFunctions.cleanDate(dp)
}
else if (res.size ==1) {
} else if (res.size == 1) {
return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
}
}

View File

@ -73,10 +73,10 @@ class CrossrefMappingTest {
}
@Test
def crossrefIssueDateTest(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
val json =
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)

View File

@ -0,0 +1,243 @@
package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
 * Spark job that adds EOSC subjects to results.
 *
 * <p>For each of the {@code software}, {@code otherresearchproduct} and {@code dataset} dumps found
 * under the source path, the job reads the records, appends the matching EOSC subject(s), writes
 * the tagged records under the working path and finally copies them back over the source dump.
 *
 * <p>Tagging rules:
 * <ul>
 * <li>software: {@link #EOSC_NOTEBOOK} and/or {@link #EOSC_GALAXY}</li>
 * <li>other research product: {@link #EOSC_GALAXY} and/or {@link #EOSC_TWITTER}</li>
 * <li>dataset: {@link #EOSC_TWITTER}</li>
 * </ul>
 */
public class SparkEoscTag {

	private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
		.qualifier(
			"eosc",
			"European Open Science Cloud",
			ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);

	public static final DataInfo EOSC_DATAINFO = OafMapperUtils
		.dataInfo(
			false, "propagation", true, false,
			OafMapperUtils
				.qualifier(
					"propagation:subject", "Inferred by OpenAIRE",
					ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
			"0.9");

	public static final StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
		.structuredProperty(
			"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);

	public static final StructuredProperty EOSC_GALAXY = OafMapperUtils
		.structuredProperty(
			"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);

	public static final StructuredProperty EOSC_TWITTER = OafMapperUtils
		.structuredProperty(
			"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code sourcePath} and {@code workingPath} are read
	 *             unconditionally, {@code isSparkSessionManaged} defaults to {@code true} when absent
	 * @throws Exception if argument parsing or the Spark execution fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkEoscTag.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String workingPath = parser.get("workingPath");
		log.info("workingPath: {}", workingPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> execEoscTag(spark, inputPath, workingPath));
	}

	/**
	 * Tags the three supported result types, one after the other.
	 */
	private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
		tagAndReplace(spark, inputPath, workingPath, "software", Software.class, SparkEoscTag::tagSoftware);
		tagAndReplace(
			spark, inputPath, workingPath, "otherresearchproduct", OtherResearchProduct.class,
			SparkEoscTag::tagOtherResearchProduct);
		tagAndReplace(spark, inputPath, workingPath, "dataset", Dataset.class, SparkEoscTag::tagDataset);
	}

	/**
	 * Applies {@code tagger} to every record of one entity dump and replaces the dump with the result.
	 *
	 * <p>The tagged records are first written under {@code workingPath}; only once that write has
	 * completed are they read back and written over the original location under {@code inputPath}.
	 *
	 * @param entity sub-folder name shared by the input and the working location
	 * @param clazz bean class of the records
	 * @param tagger function adding the EOSC subjects to a single record
	 */
	private static <R extends Result> void tagAndReplace(
		SparkSession spark, String inputPath, String workingPath,
		String entity, Class<R> clazz, MapFunction<R, R> tagger) {

		final String sourceLocation = inputPath + "/" + entity;
		final String stagingLocation = workingPath + "/" + entity;

		readPath(spark, sourceLocation, clazz)
			.map(tagger, Encoders.bean(clazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(stagingLocation);

		readPath(spark, stagingLocation, clazz)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(sourceLocation);
	}

	/** Adds the notebook and/or Galaxy subject to a software record. */
	private static Software tagSoftware(Software s) {
		final List<StructuredProperty> subjects = ensureSubjects(s);
		if (containsCriteriaNotebook(s)) {
			subjects.add(EOSC_NOTEBOOK);
		}
		if (containsCriteriaGalaxy(s)) {
			subjects.add(EOSC_GALAXY);
		}
		return s;
	}

	/** Adds the Galaxy and/or Twitter subject to an other-research-product record. */
	private static OtherResearchProduct tagOtherResearchProduct(OtherResearchProduct orp) {
		final List<StructuredProperty> subjects = ensureSubjects(orp);
		if (containsCriteriaGalaxy(orp)) {
			subjects.add(EOSC_GALAXY);
		}
		if (containsCriteriaTwitter(orp)) {
			subjects.add(EOSC_TWITTER);
		}
		return orp;
	}

	/** Adds the Twitter subject to a dataset record. */
	private static Dataset tagDataset(Dataset d) {
		final List<StructuredProperty> subjects = ensureSubjects(d);
		if (containsCriteriaTwitter(d)) {
			subjects.add(EOSC_TWITTER);
		}
		return d;
	}

	/** Returns the subject list of {@code r}, installing an empty one first when it is missing. */
	private static List<StructuredProperty> ensureSubjects(Result r) {
		if (r.getSubject() == null) {
			r.setSubject(new ArrayList<>());
		}
		return r.getSubject();
	}

	/**
	 * A result is Twitter data when its title/description words include "twitter" together with
	 * "data" or "dataset", or when its subjects mention both "twitter" and "data".
	 */
	private static boolean containsCriteriaTwitter(Result r) {
		final Set<String> words = titleAndDescriptionWords(r);
		if (words.contains("twitter") && (words.contains("data") || words.contains("dataset"))) {
			return true;
		}
		return anySubjectContains(r, "twitter") && anySubjectContains(r, "data");
	}

	/**
	 * A result is a Galaxy workflow when its title/description words include both "galaxy" and
	 * "workflow", or when its subjects mention both terms.
	 */
	private static boolean containsCriteriaGalaxy(Result r) {
		final Set<String> words = titleAndDescriptionWords(r);
		if (words.contains("galaxy") && words.contains("workflow")) {
			return true;
		}
		return anySubjectContains(r, "galaxy") && anySubjectContains(r, "workflow");
	}

	/**
	 * A software is a notebook when a subject mentions "jupyter", or when "python" and "notebook"
	 * are both mentioned (in the same subject or in two different ones).
	 */
	private static boolean containsCriteriaNotebook(Software s) {
		if (anySubjectContains(s, "jupyter")) {
			return true;
		}
		return anySubjectContains(s, "python") && anySubjectContains(s, "notebook");
	}

	/**
	 * Tells whether at least one subject value contains {@code token}, ignoring case.
	 * A missing subject list, null subjects and null subject values never match.
	 *
	 * @param token the lower-case text to look for
	 */
	private static boolean anySubjectContains(Result r, String token) {
		final List<StructuredProperty> subjects = r.getSubject();
		if (subjects == null) {
			return false;
		}
		return subjects
			.stream()
			.filter(Objects::nonNull)
			.map(StructuredProperty::getValue)
			.filter(Objects::nonNull)
			// Locale.ROOT keeps the match independent from the JVM default locale
			.anyMatch(value -> value.toLowerCase(Locale.ROOT).contains(token));
	}

	/** Collects the lower-cased words found in the titles and in the descriptions of {@code r}. */
	private static Set<String> titleAndDescriptionWords(Result r) {
		final Set<String> words = new HashSet<>();
		addWords(words, r.getTitle(), StructuredProperty::getValue);
		addWords(words, r.getDescription(), Field::getValue);
		return words;
	}

	/**
	 * Splits the textual value of each element into lower-cased words and adds them to {@code words}.
	 * Every character other than an ASCII letter or a blank is dropped before splitting on blanks.
	 * A null list, null elements and null values are skipped.
	 *
	 * @param valueOf extracts the text from an element
	 */
	private static <T> void addWords(Set<String> words, List<T> elems, Function<T, String> valueOf) {
		if (elems == null) {
			return;
		}
		for (final T elem : elems) {
			final String value = elem == null ? null : valueOf.apply(elem);
			if (value != null) {
				words
					.addAll(
						Arrays.asList(value.toLowerCase(Locale.ROOT).replaceAll("[^a-zA-Z ]", "").split(" ")));
			}
		}
	}
}

View File

@ -0,0 +1,21 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@ -204,7 +204,31 @@
<error to="Kill"/>
</action>
<join name="wait" to="End"/>
<join name="wait" to="eosc_tag"/>
<!-- EOSC tagging step: runs eu.dnetlib.dhp.bulktag.SparkEoscTag as a Spark action, passing ${outputPath} as its sourcePath argument and ${workingDir}/eoscTag as its workingPath argument. Goes to End on success and to Kill on error. -->
<action name="eosc_tag">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.SparkEoscTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>

View File

@ -0,0 +1,547 @@
package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
public class EOSCTagJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
/**
 * Creates the temporary working directory and the local Spark session shared by all the tests.
 *
 * @throws IOException if the temporary directory cannot be created
 */
@BeforeAll
public static void beforeAll() throws IOException {
	final String testName = EOSCTagJobTest.class.getSimpleName();

	workingDir = Files.createTempDirectory(testName);
	log.info("using work dir {}", workingDir);

	// Local session with the UI switched off; warehouse locations point into the temp dir.
	final SparkConf conf = new SparkConf()
		.setAppName(testName)
		.setMaster("local[*]")
		.set("spark.driver.host", "localhost")
		.set("hive.metastore.local", "true")
		.set("spark.ui.enabled", "false")
		.set("spark.sql.warehouse.dir", workingDir.toString())
		.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

	spark = SparkSession.builder().appName(testName).config(conf).getOrCreate();
}
/**
 * Stops the shared Spark session and removes the temporary working directory.
 *
 * <p>The session is stopped first, since it was configured to keep its warehouse inside the
 * working directory; the directory is removed in a {@code finally} block so that it is cleaned up
 * even when stopping the session fails.
 *
 * @throws IOException if the working directory cannot be deleted
 */
@AfterAll
public static void afterAll() throws IOException {
	try {
		spark.stop();
	} finally {
		FileUtils.deleteDirectory(workingDir.toFile());
	}
}
@Test
void jupyterUpdatesTest() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
.map(
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
Encoders.bean(Software.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/software");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
.map(
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
.readValue(value, OtherResearchProduct.class),
Encoders.bean(OtherResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/otherresearchproduct");
SparkEoscTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input",
"-workingPath", workingDir.toString() + "/working"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/input/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
Assertions
.assertEquals(
4,
tmp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
2, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
5, tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
9, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
5, tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
9, tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect()
.get(0)
.getSubject();
Assertions.assertEquals(8, subjects.size());
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("algorithme")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("simulation numérique")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));
Assertions
.assertEquals(
10, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
10, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
// spark.stop();
}
@Test
void galaxyUpdatesTest() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
.map(
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
Encoders.bean(Software.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/software");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
.map(
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
.readValue(value, OtherResearchProduct.class),
Encoders.bean(OtherResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/otherresearchproduct");
SparkEoscTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input",
"-workingPath", workingDir.toString() + "/working"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/input/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(11, tmp.count());
Assertions
.assertEquals(
2,
tmp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
2, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
6, tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
8, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
JavaRDD<OtherResearchProduct> orp = sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
Assertions.assertEquals(10, orp.count());
Assertions
.assertEquals(
2,
orp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
3, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
2, orp
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
orp
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
3, orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
}
@Test
void twitterUpdatesTest() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
.map(
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
Encoders.bean(Software.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/software");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
.map(
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
.readValue(value, OtherResearchProduct.class),
Encoders.bean(OtherResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/otherresearchproduct");
SparkEoscTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input",
"-workingPath", workingDir.toString() + "/working"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/input/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
Assertions
.assertEquals(
0,
tmp
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count());
JavaRDD<OtherResearchProduct> orp = sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
Assertions.assertEquals(10, orp.count());
Assertions
.assertEquals(
3,
orp
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count());
JavaRDD<Dataset> dats = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(11, dats.count());
Assertions
.assertEquals(
3,
dats
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count());
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -27,15 +27,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.io.Closeable;
import java.io.IOException;
@ -143,8 +135,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
smdbe.execute("queryClaims.sql", smdbe::processClaims);
break;
case openaire:
log.info("Processing datasources...");
smdbe.execute("queryDatasources.sql", smdbe::processDatasource, verifyNamespacePrefix);
log.info("Processing services...");
smdbe.execute("queryServices.sql", smdbe::processService, verifyNamespacePrefix);
log.info("Processing projects...");
if (dbSchema.equalsIgnoreCase("beta")) {
@ -156,10 +148,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
log.info("Processing Organizations...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing relationsNoRemoval ds <-> orgs ...");
log.info("Processing relations services <-> orgs ...");
smdbe
.execute(
"queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization,
"queryServiceOrganization.sql", smdbe::processServiceOrganization,
verifyNamespacePrefix);
log.info("Processing projects <-> orgs ...");
@ -235,32 +227,30 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
dbClient.processResults(sql, consumer);
}
public List<Oaf> processDatasource(final ResultSet rs) {
public List<Oaf> processService(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setId(createOpenaireId(10, rs.getString("id"), true));
ds
.setOriginalId(
Arrays
.asList((String[]) rs.getArray("identities").getArray())
.asList((String[]) rs.getArray("originalid").getArray())
.stream()
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList()));
ds
.setCollectedfrom(
listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>());
ds.setCollectedfrom(prepareCollectedfrom(rs.getArray("collectedfrom")));
ds.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
ds.setDateoftransformation(null); // Value not returned by the SQL query
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setDatasourcetypeui(prepareQualifierSplitting(rs.getString("datasourcetypeui")));
ds.setEosctype(prepareQualifierSplitting(rs.getString("eosctype")));
ds.setEoscdatasourcetype(prepareQualifierSplitting(rs.getString("eoscdatasourcetype")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info));
@ -277,20 +267,19 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
ds.setLanguages(listValues(rs.getArray("languages")));
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
ds.setVersioning(field(rs.getBoolean("versioning"), info));
ds.setVersioncontrol(rs.getBoolean("versioncontrol"));
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
ds.setPidsystems(field(rs.getString("pidsystems"), info));
ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
@ -299,13 +288,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
journal(
rs.getString("officialname"), rs.getString("issnPrinted"), rs.getString("issnOnline"),
rs.getString("issnLinking"), info)); // Journal
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
ds.setResearchentitytypes(listValues(rs.getArray("researchentitytypes")));
ds.setJurisdiction(prepareQualifierSplitting(rs.getString("jurisdiction")));
ds.setThematic(rs.getBoolean("thematic"));
ds.setKnowledgegraph(rs.getBoolean("knowledgegraph"));
ds.setContentpolicies(prepareListOfQualifiers(rs.getArray("contentpolicies")));
ds.setSubmissionpolicyurl(rs.getString("submissionpolicyurl"));
ds.setPreservationpolicyurl(rs.getString("preservationpolicyurl"));
ds.setResearchproductaccesspolicies(listValues(rs.getArray("researchproductaccesspolicies")));
ds
.setResearchproductmetadataaccesspolicies(
listValues(rs.getArray("researchproductmetadataaccesspolicies")));
ds.setConsenttermsofuse(rs.getBoolean("consenttermsofuse"));
ds.setFulltextdownload(rs.getBoolean("fulltextdownload"));
ds
@ -313,8 +307,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
Optional
.ofNullable(
rs.getDate("consenttermsofusedate"))
.map(c -> c.toString())
.map(java.sql.Date::toString)
.orElse(null));
ds
.setLastconsenttermsofusedate(
Optional
.ofNullable(
rs.getDate("lastconsenttermsofusedate"))
.map(java.sql.Date::toString)
.orElse(null));
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(ds);
} catch (final Exception e) {
@ -425,11 +429,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
}
}
public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
public List<Oaf> processServiceOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization"), true);
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
final String dsId = createOpenaireId(10, rs.getString("service"), true);
final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
@ -603,6 +607,32 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
String.format("%.3f", trust));
}
/**
 * Converts a SQL array of {@code id@@@name} strings into the list of collectedfrom entries.
 *
 * <p>Null array elements and duplicates are skipped. Elements that {@link #keyValueSplitting(String, String)}
 * cannot parse (blank, or not made of exactly two parts) are dropped as well, so the returned list never
 * contains {@code null} entries.
 *
 * @param values the SQL array to convert, may be {@code null}
 * @return the parsed entries, or {@code null} when {@code values} is {@code null}
 * @throws SQLException if the content of the SQL array cannot be retrieved
 */
private List<KeyValue> prepareCollectedfrom(Array values) throws SQLException {
	if (Objects.isNull(values)) {
		return null;
	}
	return Arrays
		.stream((String[]) values.getArray())
		.filter(Objects::nonNull)
		.distinct()
		.map(s -> keyValueSplitting(s, "@@@"))
		// keyValueSplitting yields null for malformed entries: keep them out of the result
		.filter(Objects::nonNull)
		.collect(Collectors.toList());
}
/**
 * Splits a string of the form {@code <id><separator><name>} into a {@link KeyValue}.
 *
 * <p>The key is the first part passed through {@code createOpenaireId(10, ..., true)}; the value is the second
 * part, taken verbatim.
 *
 * @param s the string to split
 * @param separator the literal separator between the two parts; it is not interpreted as a regular expression
 * @return the resulting {@link KeyValue}, or {@code null} when {@code s} is blank or does not split into exactly
 *         two parts
 */
public static KeyValue keyValueSplitting(final String s, String separator) {
	if (StringUtils.isBlank(s)) {
		return null;
	}
	// String.split expects a regex: quote the separator so that characters such as '|' or '.' match literally
	final String[] arr = s.split(java.util.regex.Pattern.quote(separator));
	if (arr.length != 2) {
		return null;
	}
	KeyValue kv = new KeyValue();
	kv.setKey(createOpenaireId(10, arr[0], true));
	kv.setValue(arr[1]);
	return kv;
}
private Qualifier prepareQualifierSplitting(final String s) {
if (StringUtils.isBlank(s)) {
return null;

View File

@ -30,6 +30,11 @@
<value></value>
<description>a blacklist of nsprefixes (comma separated)</description>
</property>
<property>
<name>reuseContent</name>
<value>false</value>
<description>reuse content in the aggregator database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -85,12 +90,20 @@
</configuration>
</global>
<start to="ImportDB"/>
<start to="reuse_db"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Entry decision of the workflow, driven by the 'reuseContent' property:
     false: go to ImportDB; true: jump straight to GenerateEntities.
     Any other value falls back to ImportDB. -->
<decision name="reuse_db">
<switch>
<case to="ImportDB">${wf:conf('reuseContent') eq false}</case>
<case to="GenerateEntities">${wf:conf('reuseContent') eq true}</case>
<default to="ImportDB"/>
</switch>
</decision>
<action name="ImportDB">
<java>
<prepare>
@ -102,6 +115,7 @@
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>openaire</arg>
<arg>--dbschema</arg><arg>${dbSchema}</arg>
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
</java>
@ -124,6 +138,55 @@
<arg>--action</arg><arg>claims</arg>
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
</java>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<!-- Spark action running GenerateEntitiesApplication on YARN (cluster mode).
     Source paths: ${contentPath}/db_records and ${contentPath}/db_claims.
     Target path: ${workingDir}/entities.
     On success the workflow continues with GenerateGraph, on error it goes to Kill. -->
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/db_claims</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--shouldHashId</arg><arg>true</arg>
</spark>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<!-- Spark action running DispatchEntitiesApplication on YARN (cluster mode).
     Source path: ${workingDir}/entities, the target path of GenerateEntities.
     Graph output path: ${workingDir}/graph_aggregator.
     On success the workflow goes to End, on error to Kill. -->
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/entities</arg>
<arg>--graphRawPath</arg><arg>${workingDir}/graph_aggregator</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>

View File

@ -27,7 +27,7 @@ SELECT
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid
FROM dsm_organizations o
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
LEFT OUTER JOIN dsm_services d ON (d.id = o.collectedfrom)
LEFT OUTER JOIN dsm_organizationpids p ON (p.organization = o.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = p.pid)
GROUP BY

View File

@ -16,4 +16,4 @@ SELECT
FROM project_organization po
LEFT OUTER JOIN projects p ON (p.id = po.project)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom);
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom);

View File

@ -42,7 +42,7 @@ SELECT
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)

View File

@ -40,7 +40,7 @@ SELECT
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN dsm_services dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)

View File

@ -1,5 +1,5 @@
SELECT
dor.datasource AS datasource,
dor.service AS service,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
@ -11,6 +11,6 @@ SELECT
dc.officialname AS collectedfromname,
'providedBy@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@dnet:provenanceActions' AS provenanceaction
FROM dsm_datasource_organization dor
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom);
FROM dsm_service_organization dor
LEFT OUTER JOIN dsm_services d ON (dor.service = d.id)
LEFT OUTER JOIN dsm_services dc ON (dc.id = d.collectedfrom);

View File

@ -1,6 +1,7 @@
SELECT
d.id AS datasourceid,
d.id || array_agg(distinct di.pid) AS identities,
d.id AS id,
array_remove(d.id || array_agg(distinct CASE WHEN dp.pid like 'piwik%' THEN di.pid ELSE NULL END) || array_agg(distinct dds.duplicate), NULL) AS originalid,
array_remove(array_agg(distinct CASE WHEN di.pid NOT LIKE 'piwik%' THEN di.pid||'###'||di.issuertype||'@@@'||'dnet:pid_types' ELSE NULL END), NULL) as pid,
d.officialname AS officialname,
d.englishname AS englishname,
d.contactemail AS contactemail,
@ -9,8 +10,8 @@ SELECT
THEN
'openaire-cris_1.1@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire4.0'])
THEN
'openaire4.0@@@dnet:datasourceCompatibilityLevel'
THEN
'openaire4.0@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
THEN
'driver-openaire2.0@@@dnet:datasourceCompatibilityLevel'
@ -40,25 +41,29 @@ SELECT
END AS openairecompatibility,
d.websiteurl AS websiteurl,
d.logourl AS logourl,
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
array_remove(array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END), NULL) AS accessinfopackage,
d.latitude AS latitude,
d.longitude AS longitude,
d.namespaceprefix AS namespaceprefix,
NULL AS odnumberofitems,
NULL AS odnumberofitemsdate,
(SELECT array_agg(s|| '###keywords@@@dnet:subject_classification_typologies')
(SELECT array_agg(s|| '###keyword@@@dnet:subject_classification_typologies')
FROM UNNEST(
ARRAY(
SELECT trim(s)
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
d.description AS description,
NULL AS odpolicies,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
array_remove(ARRAY(SELECT trim(s)
FROM unnest(string_to_array(regexp_replace(d.languages, '{|}|"', '', 'g'), ',')) AS s), '{}') AS odlanguages,
array_remove(ARRAY(SELECT trim(s)
FROM unnest(string_to_array(regexp_replace(d.languages, '{|}|"', '', 'g'), ',')) AS s), '{}') AS languages,
-- Term provided only by OpenDOAR:
-- probably updating the TR it could be replaced by research_entity_types[]
-- But a study on the vocabulary terms is needed
-- REMOVED: ARRAY(SELECT trim(s) FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
@ -69,39 +74,59 @@ SELECT
d.releasestartdate AS releasestartdate,
d.releaseenddate AS releaseenddate,
d.missionstatementurl AS missionstatementurl,
d.dataprovider AS dataprovider,
d.serviceprovider AS serviceprovider,
-- the following 2 fields (provided by re3data) have been replaced by research_entity_types[]
-- VALUE 'Research Data' : d.dataprovider AS dataprovider,
-- VALUE 'Services' : d.serviceprovider AS serviceprovider,
d.databaseaccesstype AS databaseaccesstype,
d.datauploadtype AS datauploadtype,
d.databaseaccessrestriction AS databaseaccessrestriction,
d.datauploadrestriction AS datauploadrestriction,
d.versioning AS versioning,
-- REPLACED BY version_control : d.versioning AS versioning,
d.version_control AS versioning,
d.version_control AS versioncontrol,
d.citationguidelineurl AS citationguidelineurl,
d.qualitymanagementkind AS qualitymanagementkind,
d.pidsystems AS pidsystems,
array_to_string(array_agg(distinct dps.scheme), ' ') AS pidsystems,
d.certificates AS certificates,
ARRAY[]::text[] AS policies,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
d.typology||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_remove(
array(
select distinct cf
from unnest(
dc.id||'@@@'||dc.officialname || array_agg(distinct dds_cf.id||'@@@'||dds_cf.officialname)
) as cf),
NULL) AS collectedfrom,
d._typology_to_remove_||'@@@dnet:datasource_typologies' AS datasourcetype,
d._typology_to_remove_||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
d.eosc_type||'@@@dnet:eosc_types' AS eosctype,
d.eosc_datasource_type||'@@@dnet:eosc_datasource_types' AS eoscdatasourcetype,
d.issn AS issnPrinted,
d.eissn AS issnOnline,
d.lissn AS issnLinking,
d.research_entity_types AS researchentitytypes,
d.consenttermsofuse AS consenttermsofuse,
d.fulltextdownload AS fulltextdownload,
d.consenttermsofusedate AS consenttermsofusedate,
de.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
de.thematic AS thematic,
de.knowledge_graph AS knowledgegraph,
array(select unnest(de.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies
d.lastconsenttermsofusedate AS lastconsenttermsofusedate,
d.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
d.thematic AS thematic,
array(select unnest(d.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies,
nullif(trim(d.submission_policy_url), '') AS submissionpolicyurl,
nullif(trim(d.preservation_policy_url), '') AS preservationpolicyurl,
array_remove(d.research_product_access_policies, '') AS researchproductaccesspolicies,
array_remove(d.research_product_metadata_access_policies, '') AS researchproductmetadataaccesspolicies
FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources_eosc de on (d.id = de.id)
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
FROM dsm_services d
LEFT OUTER JOIN dsm_services dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.service)
LEFT OUTER JOIN dsm_servicepids dp ON (d.id = dp.service)
LEFT OUTER JOIN dsm_identities di ON (dp.pid = di.pid)
LEFT OUTER JOIN dsm_dedup_services dds ON (d.id = dds.id)
LEFT OUTER JOIN dsm_services dds_dup ON (dds.duplicate = dds_dup.id)
LEFT OUTER JOIN dsm_services dds_cf ON (dds_dup.collectedfrom = dds_cf.id)
LEFT OUTER JOIN dsm_pid_systems dps ON (d.id = dps.service)
WHERE
d.dedup_main_service = true
GROUP BY
d.id,
@ -119,23 +144,27 @@ GROUP BY
d.releasestartdate,
d.releaseenddate,
d.missionstatementurl,
d.dataprovider,
d.serviceprovider,
-- TODO REMOVED ???: d.dataprovider,
-- TODO REMOVED ???: d.serviceprovider,
d.databaseaccesstype,
d.datauploadtype,
d.databaseaccessrestriction,
d.datauploadrestriction,
d.versioning,
-- REPLACED BY version_control : d.versioning,
d.version_control,
d.citationguidelineurl,
d.qualitymanagementkind,
d.pidsystems,
-- REMOVED: d.qualitymanagementkind,
d.certificates,
dc.id,
dc.officialname,
d.issn,
d.eissn,
d.lissn,
de.jurisdiction,
de.thematic,
de.knowledge_graph,
de.content_policies
d.jurisdiction,
d.thematic,
-- REMOVED ???: de.knowledge_graph,
d.content_policies,
d.submission_policy_url,
d.preservation_policy_url,
d.research_product_access_policies,
d.research_product_metadata_access_policies

View File

@ -12,8 +12,11 @@ import java.sql.Array;
import java.sql.Date;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@ -28,12 +31,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ExtendWith(MockitoExtension.class)
@ -63,22 +61,32 @@ public class MigrateDbEntitiesApplicationTest {
}
@Test
public void testProcessDatasource() throws Exception {
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
public void testProcessService() throws Exception {
final List<TypedField> fields = prepareMocks("services_resultset_entry.json");
final List<Oaf> list = app.processDatasource(rs);
final List<Oaf> list = app.processService(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Datasource ds = (Datasource) list.get(0);
assertValidId(ds.getId());
assertValidId(ds.getCollectedfrom().get(0).getKey());
ds
.getCollectedfrom()
.stream()
.map(KeyValue::getKey)
.forEach(dsId -> assertValidId(dsId));
assertEquals(1, ds.getPid().size());
assertEquals("r3d100010218", ds.getPid().get(0).getValue());
assertEquals("re3data", ds.getPid().get(0).getQualifier().getClassid());
assertEquals("dnet:pid_types", ds.getPid().get(0).getQualifier().getSchemeid());
assertEquals(getValueAsString("officialname", fields), ds.getOfficialname().getValue());
assertEquals(getValueAsString("englishname", fields), ds.getEnglishname().getValue());
assertEquals(getValueAsString("contactemail", fields), ds.getContactemail().getValue());
assertEquals(getValueAsString("websiteurl", fields), ds.getWebsiteurl().getValue());
assertEquals(getValueAsString("logourl", fields), ds.getLogourl());
assertEquals(getValueAsString("contactemail", fields), ds.getContactemail().getValue());
assertEquals(getValueAsString("namespaceprefix", fields), ds.getNamespaceprefix().getValue());
assertEquals(getValueAsString("collectedfromname", fields), ds.getCollectedfrom().get(0).getValue());
assertEquals(getValueAsString("officialname", fields), ds.getJournal().getName());
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
@ -90,19 +98,98 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals("pubsrepository::journal", ds.getDatasourcetypeui().getClassid());
assertEquals("dnet:datasource_typologies_ui", ds.getDatasourcetypeui().getSchemeid());
assertEquals("Data Source", ds.getEosctype().getClassid());
assertEquals("Data Source", ds.getEosctype().getClassname());
assertEquals("dnet:eosc_types", ds.getEosctype().getSchemeid());
assertEquals("dnet:eosc_types", ds.getEosctype().getSchemename());
assertEquals("Journal archive", ds.getEoscdatasourcetype().getClassid());
assertEquals("Journal archive", ds.getEoscdatasourcetype().getClassname());
assertEquals("dnet:eosc_datasource_types", ds.getEoscdatasourcetype().getSchemeid());
assertEquals("dnet:eosc_datasource_types", ds.getEoscdatasourcetype().getSchemename());
assertEquals("openaire4.0", ds.getOpenairecompatibility().getClassid());
assertEquals("openaire4.0", ds.getOpenairecompatibility().getClassname());
assertEquals("dnet:datasourceCompatibilityLevel", ds.getOpenairecompatibility().getSchemeid());
assertEquals("dnet:datasourceCompatibilityLevel", ds.getOpenairecompatibility().getSchemename());
assertEquals(getValueAsDouble("latitude", fields).toString(), ds.getLatitude().getValue());
assertEquals(getValueAsDouble("longitude", fields).toString(), ds.getLongitude().getValue());
assertEquals(getValueAsString("dateofvalidation", fields), ds.getDateofvalidation());
assertEquals(getValueAsString("description", fields), ds.getDescription().getValue());
// TODO assertEquals(getValueAsString("subjects", fields), ds.getSubjects());
assertEquals("0.0", ds.getOdnumberofitems().getValue());
assertEquals(getValueAsString("odnumberofitemsdate", fields), ds.getOdnumberofitemsdate());
assertEquals(getValueAsString("odpolicies", fields), ds.getOdpolicies());
assertEquals(
getValueAsList("odlanguages", fields),
ds.getOdlanguages().stream().map(Field::getValue).collect(Collectors.toList()));
assertEquals(getValueAsList("languages", fields), ds.getLanguages());
assertEquals(
getValueAsList("accessinfopackage", fields),
ds.getAccessinfopackage().stream().map(Field::getValue).collect(Collectors.toList()));
assertEquals(getValueAsString("releasestartdate", fields), ds.getReleasestartdate());
// the end date must be checked against its own getter, not the start-date getter again
assertEquals(getValueAsString("releaseenddate", fields), ds.getReleaseenddate());
assertEquals(getValueAsString("missionstatementurl", fields), ds.getMissionstatementurl());
assertEquals(false, ds.getDataprovider().getValue());
assertEquals(false, ds.getServiceprovider().getValue());
assertEquals(getValueAsString("databaseaccesstype", fields), ds.getDatabaseaccesstype());
assertEquals(getValueAsString("datauploadtype", fields), ds.getDatauploadtype());
assertEquals(getValueAsString("databaseaccessrestriction", fields), ds.getDatabaseaccessrestriction());
assertEquals(getValueAsString("datauploadrestriction", fields), ds.getDatauploadrestriction());
assertEquals(false, ds.getVersioning().getValue());
assertEquals(false, ds.getVersioncontrol());
assertEquals(getValueAsString("citationguidelineurl", fields), ds.getCitationguidelineurl());
assertEquals(getValueAsString("pidsystems", fields), ds.getPidsystems());
assertEquals(getValueAsString("certificates", fields), ds.getCertificates());
assertEquals(getValueAsList("researchentitytypes", fields), ds.getResearchentitytypes());
assertEquals("National", ds.getJurisdiction().getClassid());
assertEquals("eosc:jurisdictions", ds.getJurisdiction().getSchemeid());
assertTrue(ds.getThematic());
assertTrue(ds.getKnowledgegraph());
assertEquals(1, ds.getContentpolicies().size());
assertEquals("Journal article", ds.getContentpolicies().get(0).getClassid());
assertEquals("eosc:contentpolicies", ds.getContentpolicies().get(0).getSchemeid());
// every content policy qualifier must carry the same scheme id ...
final HashSet<String> cpSchemeId = ds
	.getContentpolicies()
	.stream()
	.map(Qualifier::getSchemeid)
	.collect(Collectors.toCollection(HashSet::new));
// assertEquals reports the actual size on failure, unlike assertTrue(size == 1)
assertEquals(1, cpSchemeId.size());
assertTrue(cpSchemeId.contains("eosc:contentpolicies"));
// ... and the same scheme name
final HashSet<String> cpSchemeName = ds
	.getContentpolicies()
	.stream()
	.map(Qualifier::getSchemename)
	.collect(Collectors.toCollection(HashSet::new));
assertEquals(1, cpSchemeName.size());
assertTrue(cpSchemeName.contains("eosc:contentpolicies"));
assertEquals(2, ds.getContentpolicies().size());
assertEquals("Taxonomic classification", ds.getContentpolicies().get(0).getClassid());
assertEquals("Resource collection", ds.getContentpolicies().get(1).getClassid());
assertEquals(getValueAsString("submissionpolicyurl", fields), ds.getSubmissionpolicyurl());
assertEquals(getValueAsString("preservationpolicyurl", fields), ds.getPreservationpolicyurl());
assertEquals(
getValueAsList("researchproductaccesspolicies", fields),
ds.getResearchproductaccesspolicies());
assertEquals(
getValueAsList("researchproductmetadataaccesspolicies", fields),
ds.getResearchproductmetadataaccesspolicies());
assertEquals(true, ds.getConsenttermsofuse());
assertEquals(true, ds.getFulltextdownload());
assertEquals("2022-03-11", ds.getConsenttermsofusedate());
assertEquals("2022-03-11", ds.getLastconsenttermsofusedate());
}
@Test
@ -154,7 +241,7 @@ public class MigrateDbEntitiesApplicationTest {
public void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processDatasourceOrganization(rs);
final List<Oaf> list = app.processServiceOrganization(rs);
assertEquals(2, list.size());
verifyMocks(fields);
@ -356,18 +443,31 @@ public class MigrateDbEntitiesApplicationTest {
}
/**
 * Reads the value of the named field and converts it to a {@link Float}.
 *
 * @param name the name of the field to look up
 * @param fields the typed fields to search
 * @return the value parsed from its string form, or {@code null} when {@code getValueAs} returns {@code null}
 */
private Float getValueAsFloat(final String name, final List<TypedField> fields) {
	final Object value = getValueAs(name, fields);
	// null-safe: a missing or null field yields null instead of a NullPointerException;
	// Float.valueOf replaces the deprecated Float(String) constructor
	return value != null ? Float.valueOf(value.toString()) : null;
}
/**
 * Reads the value of the named field and converts it to a {@link Double}.
 *
 * @param name the name of the field to look up
 * @param fields the typed fields to search
 * @return the value parsed from its string form, or {@code null} when {@code getValueAs} returns {@code null}
 */
private Double getValueAsDouble(final String name, final List<TypedField> fields) {
	final Object value = getValueAs(name, fields);
	// Double.valueOf replaces the deprecated Double(String) constructor; parsing is the same
	return value != null ? Double.valueOf(value.toString()) : null;
}
/**
 * Reads the value of the named field and converts it to an {@link Integer}.
 *
 * @param name the name of the field to look up
 * @param fields the typed fields to search
 * @return the value parsed from its string form, or {@code null} when {@code getValueAs} returns {@code null}
 */
private Integer getValueAsInt(final String name, final List<TypedField> fields) {
	final Object value = getValueAs(name, fields);
	// Integer.valueOf replaces the deprecated Integer(String) constructor; parsing is the same
	return value != null ? Integer.valueOf(value.toString()) : null;
}
private <T> T getValueAs(final String name, final List<TypedField> fields) {
return fields
final Optional<T> field = fields
.stream()
.filter(f -> f.getField().equals(name))
.map(TypedField::getValue)
.filter(Objects::nonNull)
.map(o -> (T) o)
.findFirst()
.get();
.map(TypedField::getValue)
.map(o -> (T) o);
if (!field.isPresent()) {
return null;
}
return field.get();
}
private List<String> getValueAsList(final String name, final List<TypedField> fields) {

View File

@ -5,14 +5,47 @@
"value": "274269ac6f3b::2579-5449"
},
{
"field": "identities",
"field": "originalid",
"type": "array",
"value": [
"274269ac6f3b::2579-5449",
"fairsharing_::1562",
"piwik:13",
null
null,
"re3data_____::r3d100010213"
]
},
{
"field": "pid",
"type": "array",
"value": [
"r3d100010218###re3data@@@dnet:pid_types"
]
},
{
"field": "datasourcetype",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
},
{
"field": "datasourcetypeui",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
},
{
"field": "eosctype",
"type": "string",
"value": "Data Source@@@dnet:eosc_types"
},
{
"field": "eoscdatasourcetype",
"type": "string",
"value": "Journal archive@@@dnet:eosc_datasource_types"
},
{
"field": "openairecompatibility",
"type": "string",
"value": "openaire4.0@@@dnet:datasourceCompatibilityLevel"
},
{
"field": "officialname",
"type": "string",
@ -23,16 +56,6 @@
"type": "string",
"value": "Jurnal Ilmiah Pendidikan Scholastic"
},
{
"field": "contactemail",
"type": "string",
"value": "test@test.it"
},
{
"field": "openairecompatibility",
"type": "string",
"value": "hostedBy@@@dnet:datasourceCompatibilityLevel"
},
{
"field": "websiteurl",
"type": "string",
@ -44,11 +67,14 @@
"value": null
},
{
"field": "accessinfopackage",
"type": "array",
"value": [
null
]
"field": "contactemail",
"type": "string",
"value": "test@test.it"
},
{
"field": "namespaceprefix",
"type": "string",
"value": "ojs_25795449"
},
{
"field": "latitude",
@ -61,9 +87,19 @@
"value": 0
},
{
"field": "namespaceprefix",
"field": "dateofvalidation",
"type": "date",
"value": null
},
{
"field": "description",
"type": "string",
"value": "ojs_25795449"
"value": "veterinary medicine"
},
{
"field": "subjects",
"type": "array",
"value": []
},
{
"field": "odnumberofitems",
@ -75,16 +111,6 @@
"type": "date",
"value": null
},
{
"field": "subjects",
"type": "array",
"value": null
},
{
"field": "description",
"type": "string",
"value": null
},
{
"field": "odpolicies",
"type": "string",
@ -93,44 +119,33 @@
{
"field": "odlanguages",
"type": "array",
"value": []
},
{
"field": "odcontenttypes",
"type": "array",
"value": [
"Journal articles"
"English",
"German",
"French",
"Danish",
"Norwegian",
"Swedish"
]
},
{
"field": "inferred",
"type": "boolean",
"value": false
"field": "languages",
"type": "array",
"value": [
"English",
"German",
"French",
"Danish",
"Norwegian",
"Swedish"
]
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "dateofcollection",
"type": "date",
"value": "2020-01-21"
},
{
"field": "dateofvalidation",
"type": "date",
"value": null
"field": "accessinfopackage",
"type": "array",
"value": [
"http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"
]
},
{
"field": "releasestartdate",
@ -147,16 +162,6 @@
"type": "string",
"value": null
},
{
"field": "dataprovider",
"type": "boolean",
"value": null
},
{
"field": "serviceprovider",
"type": "boolean",
"value": null
},
{
"field": "databaseaccesstype",
"type": "string",
@ -183,12 +188,12 @@
"value": null
},
{
"field": "citationguidelineurl",
"type": "string",
"field": "versioncontrol",
"type": "boolean",
"value": null
},
{
"field": "qualitymanagementkind",
"field": "citationguidelineurl",
"type": "string",
"value": null
},
@ -208,29 +213,38 @@
"value": []
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "collectedfromname",
"type": "string",
"value": "Jurnal Fakultas Sastra Universitas Ekasakti"
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "datasourcetype",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
"field": "trust",
"type": "double",
"value": 0.9
},
{
"field": "datasourcetypeui",
"field": "inferenceprovenance",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
"value": null
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
"field": "dateofcollection",
"type": "date",
"value": "2020-01-21"
},
{
"field": "collectedfrom",
"type": "array",
"value": [
"openaire____::fairsharing@@@FAIRsharing.org",
"openaire____::opendoar@@@OpenDOAR",
"openaire____::re3data@@@Registry of Research Data Repository"
]
},
{
"field": "issnPrinted",
@ -247,6 +261,20 @@
"type": "string",
"value": "2579-5447"
},
{
"field": "researchentitytypes",
"type": "array",
"value": [
"Research Data"
]
},
{
"field": "providedproducttypes",
"type": "array",
"value": [
]
},
{
"field": "jurisdiction",
"type": "string",
@ -266,7 +294,32 @@
"field": "contentpolicies",
"type": "array",
"value": [
"Journal article@@@eosc:contentpolicies"
"Taxonomic classification@@@eosc:contentpolicies",
"Resource collection@@@eosc:contentpolicies"
]
},
{
"field": "submissionpolicyurl",
"type": "string",
"value": null
},
{
"field": "preservationpolicyurl",
"type": "string",
"value": "Permanent Archiving https://datadryad.org/stash/faq"
},
{
"field": "researchproductaccesspolicies",
"type": "array",
"value": [
"https://100percentit.com/legal/"
]
},
{
"field": "researchproductmetadataaccesspolicies",
"type": "array",
"value": [
"https://wenmr.science.uu.nl/conditions"
]
},
{
@ -283,5 +336,10 @@
"field": "consenttermsofusedate",
"type": "date",
"value": "2022-03-11"
},
{
"field": "lastconsenttermsofusedate",
"type": "date",
"value": "2022-03-11"
}
]

View File

@ -535,6 +535,12 @@ public class XmlRecordFactory implements Serializable {
if (ds.getDatasourcetypeui() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", ds.getDatasourcetypeui()));
}
if (ds.getEosctype() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("eosctype", ds.getEosctype()));
}
if (ds.getEoscdatasourcetype() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("eoscdatasourcetype", ds.getEoscdatasourcetype()));
}
if (ds.getOpenairecompatibility() != null) {
metadata
.add(
@ -583,6 +589,16 @@ public class XmlRecordFactory implements Serializable {
metadata
.add(XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue()));
}
if (ds.getSubjects() != null) {
metadata
.addAll(
ds
.getSubjects()
.stream()
.filter(Objects::nonNull)
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
.collect(Collectors.toList()));
}
if (ds.getOdnumberofitems() != null) {
metadata
.add(
@ -609,6 +625,16 @@ public class XmlRecordFactory implements Serializable {
.map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue()))
.collect(Collectors.toList()));
}
if (ds.getLanguages() != null) {
metadata
.addAll(
ds
.getLanguages()
.stream()
.filter(Objects::nonNull)
.map(c -> XmlSerializationUtils.asXmlElement("languages", c))
.collect(Collectors.toList()));
}
if (ds.getOdcontenttypes() != null) {
metadata
.addAll(
@ -689,18 +715,18 @@ public class XmlRecordFactory implements Serializable {
XmlSerializationUtils
.asXmlElement("versioning", ds.getVersioning().getValue().toString()));
}
if (ds.getVersioncontrol() != null) {
metadata
.add(
XmlSerializationUtils
.asXmlElement("versioncontrol", ds.getVersioncontrol().toString()));
}
if (ds.getCitationguidelineurl() != null) {
metadata
.add(
XmlSerializationUtils
.asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue()));
}
if (ds.getQualitymanagementkind() != null) {
metadata
.add(
XmlSerializationUtils
.asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue()));
}
if (ds.getPidsystems() != null) {
metadata
.add(XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue()));
@ -722,17 +748,24 @@ public class XmlRecordFactory implements Serializable {
if (ds.getJournal() != null) {
metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal()));
}
if (ds.getSubjects() != null) {
if (ds.getResearchentitytypes() != null) {
metadata
.addAll(
ds
.getSubjects()
.getResearchentitytypes()
.stream()
.filter(Objects::nonNull)
.map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp))
.map(c -> XmlSerializationUtils.asXmlElement("researchentitytypes", c))
.collect(Collectors.toList()));
}
if (ds.getProvidedproducttypes() != null) {
metadata
.addAll(
ds
.getProvidedproducttypes()
.stream()
.map(c -> XmlSerializationUtils.asXmlElement("providedproducttypes", c))
.collect(Collectors.toList()));
}
if (ds.getJurisdiction() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("jurisdiction", ds.getJurisdiction()));
}
@ -741,11 +774,6 @@ public class XmlRecordFactory implements Serializable {
metadata.add(XmlSerializationUtils.asXmlElement("thematic", ds.getThematic().toString()));
}
if (ds.getKnowledgegraph() != null) {
metadata
.add(XmlSerializationUtils.asXmlElement("knowledgegraph", ds.getKnowledgegraph().toString()));
}
if (ds.getContentpolicies() != null) {
metadata
.addAll(
@ -756,7 +784,34 @@ public class XmlRecordFactory implements Serializable {
.map(q -> XmlSerializationUtils.mapQualifier("contentpolicy", q))
.collect(Collectors.toList()));
}
if (ds.getSubmissionpolicyurl() != null) {
metadata
.add(XmlSerializationUtils.asXmlElement("submissionpolicyurl", ds.getSubmissionpolicyurl()));
}
if (ds.getPreservationpolicyurl() != null) {
metadata
.add(
XmlSerializationUtils.asXmlElement("preservationpolicyurl", ds.getPreservationpolicyurl()));
}
if (ds.getResearchproductaccesspolicies() != null) {
metadata
.addAll(
ds
.getResearchproductaccesspolicies()
.stream()
.map(c -> XmlSerializationUtils.asXmlElement("researchproductaccesspolicies", c))
.collect(Collectors.toList()));
}
if (ds.getResearchproductmetadataaccesspolicies() != null) {
metadata
.addAll(
ds
.getResearchproductmetadataaccesspolicies()
.stream()
.map(
c -> XmlSerializationUtils.asXmlElement("researchproductmetadataaccesspolicies", c))
.collect(Collectors.toList()));
}
break;
case organization:
final Organization o = (Organization) entity;

View File

@ -11,6 +11,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Test;
@ -142,7 +143,7 @@ public class XmlRecordFactoryTest {
}
@Test
public void testDatasource() throws IOException, DocumentException {
public void testService() throws IOException, DocumentException {
final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
@ -167,6 +168,14 @@ public class XmlRecordFactoryTest {
assertEquals("true", doc.valueOf("//thematic"));
assertEquals("Journal article", doc.valueOf("//contentpolicy/@classname"));
assertEquals("Journal archive", doc.valueOf("//datasourcetypeui/@classname"));
assertEquals("Data Source", doc.valueOf("//eosctype/@classname"));
final List pids = doc.selectNodes("//pid");
assertEquals(1, pids.size());
assertEquals("re3data", ((Element) pids.get(0)).attribute("classid").getValue());
assertEquals(
"Registry of research data repositories", ((Element) pids.get(0)).attribute("classname").getValue());
assertEquals("dnet:pid_types", ((Element) pids.get(0)).attribute("schemeid").getValue());
assertEquals("dnet:pid_types", ((Element) pids.get(0)).attribute("schemename").getValue());
}
}

View File

@ -801,7 +801,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.10.32]</dhp-schemas.version>
<dhp-schemas.version>[2.12.0]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>