From bde59a7c8f49cd964317a9e240dacb3be1eeec01 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 5 Dec 2024 11:09:30 +0100 Subject: [PATCH 1/5] implementation of the utilities for the inclusion of raids in the graph --- .../java/eu/dnetlib/dhp/common/Constants.java | 4 + .../dnetlib/dhp/actionmanager/Constants.java | 8 + .../raid/GenerateRAiDActionSetJob.java | 190 ++++++++++++++++++ .../raid/model/GenerateRAiDActionSetJob.java | 2 + .../actionmanager/raid/model/RAiDEntity.java | 102 ++++++++++ .../ror/GenerateRorActionSetJob.java | 8 +- .../raid/action_set_parameters.json | 14 ++ .../raid/oozie_app/action_set_parameters.json | 0 .../raid/oozie_app/config-default.xml | 58 ++++++ .../actionmanager/raid/oozie_app/workflow.xml | 55 +++++ .../raid/GenerateRAiDActionSetJobTest.java | 112 +++++++++++ .../dhp/actionmanager/raid/raid_example.json | 6 + 12 files changed, 552 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index 0477d6399d..b00199ea54 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -10,6 +10,10 @@ public class Constants { public static final Map accessRightsCoarMap = Maps.newHashMap(); public static final Map coarCodeLabelMap = Maps.newHashMap(); + public static final String RAID_NS_PREFIX = "raid________"; + public static final String RAID_DATASOURCE_NAME = "Research Activity Identifier Service (RAiD)"; + public static final String RAID_OPENAIRE_ID = ""; + public static final String ROR_NS_PREFIX = "ror_________"; public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 73b4b77cb9..722415c2ed 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -3,6 +3,8 @@ package eu.dnetlib.dhp.actionmanager; import java.util.Optional; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -110,6 +112,12 @@ public class Constants { } + public static Instance getInstance(Qualifier qualifier) { + Instance instance = new Instance(); + instance.setInstancetype(qualifier); + return instance; + } + public static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java new file mode 100644 index 0000000000..8e5e1bdcb0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -0,0 +1,190 @@ +package eu.dnetlib.dhp.actionmanager.raid; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.Constants; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.*; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; + +public class GenerateRAiDActionSetJob { + + private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final List RAID_COLLECTED_FROM = listKeyValues( + OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); + + private static final Qualifier RAID_QUALIFIER = qualifier("raid:openaireinference", "raid:openaireinference", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + + private static final DataInfo RAID_DATA_INFO = dataInfo( + false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_QUALIFIER, "0.92"); + + public static void main(final String[] args) throws Exception { + + final String jsonConfiguration = IOUtils + .toString( + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRAiDEntities(spark, inputPath, outputPath); + }); + } + + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + static void processRAiDEntities(final SparkSession spark, + final String inputPath, + final String outputPath) { + readInputPath(spark, inputPath) + .map(GenerateRAiDActionSetJob::prepareRAiD) + .flatMap(List::iterator) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + protected static List> prepareRAiD(final RAiDEntity r) { + + final Date now = new Date(); + final OtherResearchProduct orp = new OtherResearchProduct(); + final List> res = new ArrayList<>(); + String raidId = calculateOpenaireId(r.getRaid()); + + orp.setId(raidId); + orp.setCollectedfrom(RAID_COLLECTED_FROM); + orp.setDataInfo(RAID_DATA_INFO); + orp.setResourcetype(RAID_QUALIFIER); + orp.setTitle( + Collections.singletonList( + structuredProperty( + r.getTitle(), + qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), + RAID_DATA_INFO)) + ); + orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); + orp.setAuthor(createAuthors(r.getAuthors())); + orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + orp.setSubject( + r.getSubjects() + .stream() + .map(s -> subject(s, qualifier(DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, DNET_SUBJECT_TYPOLOGIES), RAID_DATA_INFO)) + .collect(Collectors.toList()) + ); + orp.setRelevantdate( + Arrays.asList( + structuredProperty(r.getEndDate(), qualifier("endDate","endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO), + structuredProperty(r.getStartDate(), qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO) + ) + ); + orp.setLastupdatetimestamp(now.getTime()); + orp.setDateofcollection(r.getStartDate()); + + res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); + + for(String resultId: r.getIds()) { + Relation rel1 = OafMapperUtils.getRelation( + raidId, + resultId, + ModelConstants.RESULT_RESULT, + ModelConstants.OUTCOME, + PART, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null + ); + Relation rel2 = OafMapperUtils.getRelation( + resultId, + raidId, + ModelConstants.RESULT_RESULT, + ModelConstants.OUTCOME, + IS_PART_OF, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null + ); + res.add(new AtomicAction<>(Relation.class, rel1)); + res.add(new AtomicAction<>(Relation.class, rel2)); + } + + return res; + } + + public static String calculateOpenaireId(final String raid) { + return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + } + + public static List createAuthors(final List author) { + return author.stream().map(s-> { + Author a = new Author(); + a.setFullname(s); + return a; + }).collect(Collectors.toList()); + } + + private static JavaRDD readInputPath( + final SparkSession spark, + final String path) { + + return spark + .read() + .json(path) + .as(Encoders.bean(RAiDEntity.class)) + .toJavaRDD(); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java new file mode 100644 index 0000000000..b0aec71d3f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java @@ -0,0 +1,2 @@ +package eu.dnetlib.dhp.actionmanager.raid.model;public class GenerateRAiDActionSetJob { +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java new file mode 100644 index 0000000000..bd7e28926e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java @@ -0,0 +1,102 @@ +package eu.dnetlib.dhp.actionmanager.raid.model; + +import java.io.Serializable; +import java.util.List; + +public class RAiDEntity implements Serializable { + + String raid; + List authors; + String startDate; + String endDate; + List subjects; + List titles; + List ids; + String title; + String summary; + + public RAiDEntity(){} + public RAiDEntity(String raid, List authors, String startDate, String endDate, List subjects, List titles, List ids, String title, String summary) { + this.raid = raid; + this.authors = authors; + this.startDate = startDate; + this.endDate = endDate; + this.subjects = subjects; + this.titles = titles; + this.ids = ids; + this.title = title; + this.summary = summary; + } + + public String getRaid() { + return raid; + } + + public void setRaid(String raid) { + this.raid = raid; + } + + public List getAuthors() { + return authors; + } + + public void setAuthors(List authors) { + this.authors = authors; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public List getSubjects() { + return subjects; + } + + public void setSubjects(List subjects) { + this.subjects = subjects; + } + + public List getTitles() { + return titles; + } + + public void setTitles(List titles) { + this.titles = titles; + } + + public List getIds() { + return ids; + } + + public void setIds(List ids) { + this.ids = ids; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java index 5f3493d567..ce1973a7f5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -21,6 +21,7 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -44,13 +45,6 @@ import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json new file mode 100644 index 0000000000..2049630d2b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "the path of the input json", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml new file mode 100644 index 0000000000..a1755f329b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + oozie.launcher.mapreduce.user.classpath.first + true + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml new file mode 100644 index 0000000000..9b5aa5905e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml @@ -0,0 +1,55 @@ + + + + raidJsonInputPath + the path of the json + + + raidActionSetPath + path where to store the action set + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + yarn + cluster + ProcessRAiDFile + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob + dhp-aggregation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputPath${raidJsonInputPath} + --outputPath${raidActionSetPath} + + + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java new file mode 100644 index 0000000000..1f33f45b2b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java @@ -0,0 +1,112 @@ +package eu.dnetlib.dhp.actionmanager.raid; + +import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import scala.Tuple2; + +import java.io.File; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; +import static java.nio.file.Files.createTempDirectory; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class GenerateRAiDActionSetJobTest { + private static String input_path; + private static String output_path; + static SparkSession spark; + + @BeforeEach + void setUp() throws Exception { + + input_path = Paths + .get(GenerateRAiDActionSetJobTest.class.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json").toURI()) + .toFile() + .getAbsolutePath(); + + output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + + SparkConf conf = new SparkConf(); + conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", output_path); + conf.set("hive.metastore.warehouse.dir", output_path); + + spark = SparkSession + .builder() + .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + static void cleanUp() throws Exception { + FileUtils.deleteDirectory(new File(output_path)); + } + + @Test + @Disabled + void testProcessRAiDEntities() { + GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD result = sc + .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(AtomicAction::getPayload); + + assertEquals(80, result.count()); + } + + @Test + void testPrepareRAiD() { + + List> atomicActions = GenerateRAiDActionSetJob.prepareRAiD(new RAiDEntity( + "-92190526", + Arrays.asList("Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"), + "2021-09-10", + "2024-02-16", + Arrays.asList("cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"), + Arrays.asList("Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment 'More is Less' from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), + Arrays.asList("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), + "Exploring Multi-Scale Map Generalization and Design", + "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval." + )); + + OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload(); + Relation rel = (Relation) atomicActions.get(1).getPayload(); + + assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue()); + assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource()); + assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget()); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json new file mode 100644 index 0000000000..7694b605c3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json @@ -0,0 +1,6 @@ +{"raid": "-9222092103004099540", "authors": ["Department of Archaeology & Museums", "Department of Archaeology and Museums", "Department Of Archaeology & Museums"], "subjects": ["Begamganj", "Raisen", "Bhopal", "Budhni", "Malwa site survey", "सीहोर", "Gauharganj", "बुधनी", "Budni", "Berasia"], "titles": ["Malwa site survey : Raisen District, Begamganj Tahsīl, photographic documentation", "Malwa site survey : Bhopal District, photographic documentation (version 1, TIFF files)", "Malwa site survey : Raisen District, Gauharganj Tahsīl, village finds", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 1)", "Malwa site survey: Bhopal District, Berasia Tahsīl, photographic documentation (with villages named)", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 2)", "Malwa site survey : Bhopal District, photographic documentation (version 2, JPEG files)"], "ids": ["50|doi_dedup___::7523d165970830dd857e6cbea4302adf", "50|doi_dedup___::02309ae8a9fae291df321e317f5c5330", "50|doi_dedup___::95347ba2c4264414fab39712ee7fe481", "50|doi_dedup___::970aa708fe667596754fd02a708780f5", "50|doi_dedup___::b7cd9128cc53b1257a4f000347f339b0", "50|doi_dedup___::c7d65da0ecedef4d2c702b9db197d90c", "50|doi_dedup___::addbb67cf5046e340f342ba091bcebfa"], "title": "Documentation of Malwa Region", "summary": "This project involves the documentation of the Malwa region through photographic surveys. The surveys were conducted by the Department of Archaeology and Museums, Madhya Pradesh, and cover various districts and tahsils. The documentation includes photographic records of sites, villages, and other relevant features. The project aims to provide a comprehensive understanding of the region's cultural and historical significance.", "startDate": "2019-03-06", "endDate": "2019-03-08"} +{"raid": "-9221424331076109424", "authors": ["Hutchings, Judy", "Ward, Catherine", "Baban, Adriana", "D��nil��, Ingrid", "Frantz, Inga", "Gardner, Frances", "Lachman, Jamie", "Lachman, Jamie M.", "Foran, Heather", "Heinrichs, Nina", "Murphy, Hugh", "B��ban, Adriana", "Raleva, Marija", "Fang, Xiangming", "Jansen, Elena", "Taut, Diana", "Foran, Heather M.", "T��ut, Diana", "Ward, Catherine L.", "Williams, Margiad", "Lesco, Galina", "Brühl, Antonia"], "subjects": ["3. Good health", "5. Gender equality", "Criminology not elsewhere classified", "1. No poverty", "2. Zero hunger"], "titles": ["sj-docx-1-vaw-10.1177_10778012231188090 - Supplemental material for Co-Occurrence of Intimate Partner Violence Against Mothers and Maltreatment of Their Children With Behavioral Problems in Eastern Europe", "Hunger in vulnerable families in Southeastern Europe: Associations with health and violence", "Prevention of child mental health problems through parenting interventions in Southeastern Europe (RISE): study protocol for a multi-site randomised controlled trial"], "ids": ["50|doi_dedup___::a70015063e5400dae2e097ee10b4a589", "50|doi_dedup___::6e1d12026fcde9087724622ccdeed430", "50|doi_dedup___::5b7bd5d46c5d95e2ef5b36663504a67e"], "title": "Exploring the Impact of Hunger and Violence on Child Health in Southeastern Europe", "summary": "This study aims to investigate the relationship between hunger, violence, and child health in vulnerable families in Southeastern Europe. The research will explore the experiences of families in FYR Macedonia, Republic of Moldova, and Romania, and examine the associations between hunger, maltreatment, and other health indicators. The study will also test the efficacy of a parenting intervention targeting child behavioral problems in alleviating these issues. The findings of this research will contribute to the development of effective interventions to address the complex needs of vulnerable families in the region.", "startDate": "2019-06-04", "endDate": "2023-01-01"} +{"raid": "-9219052635741785098", "authors": ["Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"], "subjects": ["cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"], "titles": ["Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment \"More is Less\" from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"], "ids": ["50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"], "title": "Exploring Multi-Scale Map Generalization and Design", "summary": "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.", "startDate": "2021-09-10", "endDate": "2024-02-16"} +{"raid": "-9216828847055450272", "authors": ["Grey, Alan", "Gorelov, Sergey", "Pall, Szilard", "Merz, Pascal", "Justin A., Lemkul", "Szilárd Páll", "Pasquadibisceglie, Andrea", "Kutzner, Carsten", "Schulz, Roland", "Nabet, Julien", "Abraham, Mark", "Jalalypour, Farzaneh", "Lundborg, Magnus", "Gray, Alan", "Villa, Alessandra", "Berk Hess", "Santuz, Hubert", "Irrgang, M. Eric", "Wingbermuehle, Sebastian", "Lemkul, Justin A.", "Jordan, Joe", "Pellegrino, Michele", "Doijade, Mahesh", "Shvetsov, Alexey", "Hess, Berk", "Behera, Sudarshan", "Andrey Alekseenko", "Shugaeva, Tatiana", "Fleischmann, Stefan", "Bergh, Cathrine", "Morozov, Dmitry", "Adam Hospital", "Briand, Eliane", "Lindahl, Erik", "Brown, Ania", "Marta Lloret Llinares", "Miletic, Vedran", "Alekseenko, Andrey", "Gouaillardet, Gilles", "Fiorin, Giacomo", "Basov, Vladimir"], "subjects": ["webinar"], "titles": ["Introduction to HPC: molecular dynamics simulations with GROMACS: log files", "BioExcel webinar #73: Competency frameworks to support training design and professional development", "Introduction to HPC: molecular dynamics simulations with GROMACS: output files - Devana", "GROMACS 2024.0 Manual", "BioExcel Webinar #71: GROMACS-PMX for accurate estimation of free energy differences", "Introduction to HPC: molecular dynamics simulations with GROMACS: input files", "BioExcel Webinar #68: What's new in GROMACS 2023", "BioExcel Webinar #69: BioBB-Wfs and BioBB-API, integrated web-based platform and programmatic interface for biomolecular simulations workflows using the BioExcel Building Blocks library", "GROMACS 2024-beta Source code"], "ids": ["50|doi_dedup___::8318fbc815ee1943c3269be7567f220b", "50|doi_dedup___::9530e03fb2aac63e82b18a40dc09e32c", "50|doi_dedup___::30174ab31075e76a428ca5b4f4d236b8", "50|doi_________::70b7c6dce09ae6f1361d22913fdf95eb", "50|doi_dedup___::337dd48600618f3c06257edd750d6201", "50|doi_dedup___::d622992ba9077617f37ebd268b3e806d", "50|doi_dedup___::0b0bcc6825d6c052c37882fd5cfc1e8c", "50|doi_dedup___::4b1541a7cee32527c65ace5d1ed57335", "50|doi_dedup___::1379861df59bd755e4fb39b9f95ffbd3"], "title": "Exploring High-Performance Computing and Biomolecular Simulations", "summary": "This project involves exploring high-performance computing (HPC) and biomolecular simulations using GROMACS. The objectives include understanding molecular dynamics simulations, log files, input files, and output files. Additionally, the project aims to explore competency frameworks for professional development, specifically in the field of computational biomolecular research. The tools and techniques used will include GROMACS, BioExcel Building Blocks, and competency frameworks. The expected outcomes include a deeper understanding of HPC and biomolecular simulations, as well as the development of skills in using GROMACS and BioExcel Building Blocks. The project will also contribute to the development of competency frameworks for professional development in the field of computational biomolecular research.", "startDate": "2023-04-25", "endDate": "2024-01-30"} +{"raid": "-9210544816395499758", "authors": ["Bateson, Melissa", "Andrews, Clare", "Verhulst, Simon", "Nettle, Daniel", "Zuidersma, Erica"], "subjects": ["2. Zero hunger"], "titles": ["Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings", "Data and code archive for Andrews et al. 'Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings'"], "ids": ["50|doi_dedup___::176117239be06189523c253e0ca9c5ec", "50|doi_dedup___::343e0b0ddf0d54763a89a62af1f7a379"], "title": "Investigating the Effects of Food Insecurity on Energy Storage and Somatic Maintenance in European Starlings", "summary": "This study examines the impact of food insecurity on energy storage and somatic maintenance in European starlings. The research involved exposing juvenile starlings to either uninterrupted food availability or a regime of unpredictable food unavailability. The results show that birds exposed to food insecurity stored more energy, but at the expense of somatic maintenance and repair. The study provides insights into the adaptive responses of birds to food scarcity and the trade-offs involved in energy storage and maintenance.", "startDate": "2021-06-28", "endDate": "2021-06-28"} +{"raid": "-9208499171224730388", "authors": ["Maniati, Eleni", "Bakker, Bjorn", "McClelland, Sarah E.", "Shaikh, Nadeem", "De Angelis, Simone", "Johnson, Sarah C.", "Wang, Jun", "Foijer, Floris", "Spierings, Diana C. J.", "Boemo, Michael A.", "Wardenaar, René", "Mazzagatti, Alice"], "subjects": [], "titles": ["Additional file 2 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses", "Additional file 5 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses"], "ids": ["50|doi_dedup___::a1bfeb173971f74a274fab8bdd78a4bc", "50|doi_dedup___::3d6e151aaeb2f7c40a320207fdd80ade"], "title": "Analysis of DNA Copy Number Alterations and Chromosome Scale Losses", "summary": "This study analyzed the effects of replication stress on DNA copy number alterations and chromosome scale losses. The results show distinctive landscapes of these alterations and losses, which were further investigated in additional files. The study provides valuable insights into the mechanisms of replication stress and its impact on genomic stability.", "startDate": "2022-01-01", "endDate": "2022-01-01"} \ No newline at end of file From 6af3fd16b6362c4a9357f44ba779b7822f5b7033 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 5 Dec 2024 14:39:42 +0100 Subject: [PATCH 2/5] attributes fixes --- .../java/eu/dnetlib/pace/tree/DateRange.java | 90 ++--- .../eu/dnetlib/pace/tree/JsonListMatch.java | 2 +- .../pace/comparators/ComparatorTest.java | 30 +- .../dnetlib/dhp/actionmanager/Constants.java | 4 +- .../raid/GenerateRAiDActionSetJob.java | 309 ++++++++++-------- .../raid/model/GenerateRAiDActionSetJob.java | 5 +- .../actionmanager/raid/model/RAiDEntity.java | 154 ++++----- .../ror/GenerateRorActionSetJob.java | 2 +- .../dhp/sx/bio/pubmed/PMAffiliation.java | 44 +-- .../dnetlib/dhp/sx/bio/pubmed/PMAuthor.java | 1 - .../dhp/sx/bio/pubmed/PMIdentifier.java | 74 ++--- .../raid/oozie_app/action_set_parameters.json | 0 .../actionmanager/raid/oozie_app/workflow.xml | 2 - .../collection/crossref/Crossref2Oaf.scala | 1 - .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 13 +- .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 20 +- .../raid/GenerateRAiDActionSetJobTest.java | 203 +++++++----- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 1 - .../dhp/bulktag/community/ResultTagger.java | 4 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 3 +- .../model/ProvisionModelSupport.java | 15 +- 21 files changed, 546 insertions(+), 431 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java index c913109a48..194677e6eb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java @@ -1,10 +1,5 @@ -package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.joda.time.DateTime; +package eu.dnetlib.pace.tree; import java.time.DateTimeException; import java.time.LocalDate; @@ -13,55 +8,62 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; import java.util.Map; +import org.joda.time.DateTime; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("dateRange") public class DateRange extends AbstractStringComparator { - int YEAR_RANGE; + int YEAR_RANGE; - public DateRange(Map params) { - super(params, new com.wcohen.ss.JaroWinkler()); - YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); - } + public DateRange(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); + } - public DateRange(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public DateRange(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected DateRange(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected DateRange(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - public static boolean isNumeric(String str) { - return str.matches("\\d+"); //match a number with optional '-' and decimal. - } + public static boolean isNumeric(String str) { + return str.matches("\\d+"); // match a number with optional '-' and decimal. + } - @Override - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1.0; // return -1 if a field is missing - } + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } - try { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); - LocalDate d1 = LocalDate.parse(a, formatter); - LocalDate d2 = LocalDate.parse(b, formatter); - Period period = Period.between(d1, d2); + try { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); + LocalDate d1 = LocalDate.parse(a, formatter); + LocalDate d2 = LocalDate.parse(b, formatter); + Period period = Period.between(d1, d2); - return period.getYears() <= YEAR_RANGE? 1.0 : 0.0; - } - catch (DateTimeException e) { - return -1.0; - } + return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0; + } catch (DateTimeException e) { + return -1.0; + } - } + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index e95d9206e2..d9558df90b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -62,7 +62,7 @@ public class JsonListMatch extends AbstractListComparator { Set types = Sets.intersection(typesA, typesB); - if (types.isEmpty()) // if no common type, it is impossible to compare + if (types.isEmpty()) // if no common type, it is impossible to compare return -1; ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet()); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 83539de4a8..0abde84bc4 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -72,14 +72,34 @@ public class ComparatorTest extends AbstractPaceTest { CodeMatch codeMatch = new CodeMatch(params); // names have different codes - assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf)); + assertEquals( + 0.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ir02", conf)); // names have same code - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ev02a", conf)); // code is not in both names - assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf)); - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf)); + assertEquals( + -1, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", + "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", + conf)); } @Test @@ -275,7 +295,7 @@ public class ComparatorTest extends AbstractPaceTest { Arrays .asList( "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}", - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), "authors"); List b = createFieldList( Arrays diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 722415c2ed..394cc22a35 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -3,8 +3,6 @@ package eu.dnetlib.dhp.actionmanager; import java.util.Optional; -import eu.dnetlib.dhp.schema.oaf.Instance; -import eu.dnetlib.dhp.schema.oaf.Qualifier; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -15,6 +13,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index 8e5e1bdcb0..3b24059568 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -1,15 +1,15 @@ + package eu.dnetlib.dhp.actionmanager.raid; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.Constants; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.utils.DHPUtils; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; + +import java.util.*; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -19,172 +19,191 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.Constants; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; -import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; - public class GenerateRAiDActionSetJob { - private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); + private static final Logger log = LoggerFactory + .getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final List RAID_COLLECTED_FROM = listKeyValues( - OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); + private static final List RAID_COLLECTED_FROM = listKeyValues( + OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - private static final Qualifier RAID_QUALIFIER = qualifier("raid:openaireinference", "raid:openaireinference", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + private static final Qualifier RAID_QUALIFIER = qualifier("0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); - private static final DataInfo RAID_DATA_INFO = dataInfo( - false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_QUALIFIER, "0.92"); + private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier( + "raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); - public static void main(final String[] args) throws Exception { + private static final DataInfo RAID_DATA_INFO = dataInfo( + false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_INFERENCE_QUALIFIER, "0.92"); - final String jsonConfiguration = IOUtils - .toString( - eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class - .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); + public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final String jsonConfiguration = IOUtils + .toString( + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + parser.parseArgument(args); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - final SparkConf conf = new SparkConf(); + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); - processRAiDEntities(spark, inputPath, outputPath); - }); - } + final SparkConf conf = new SparkConf(); - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRAiDEntities(spark, inputPath, outputPath); + }); + } - static void processRAiDEntities(final SparkSession spark, - final String inputPath, - final String outputPath) { - readInputPath(spark, inputPath) - .map(GenerateRAiDActionSetJob::prepareRAiD) - .flatMap(List::iterator) - .mapToPair( - aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - } + static void processRAiDEntities(final SparkSession spark, + final String inputPath, + final String outputPath) { + readInputPath(spark, inputPath) + .map(GenerateRAiDActionSetJob::prepareRAiD) + .flatMap(List::iterator) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - protected static List> prepareRAiD(final RAiDEntity r) { + } - final Date now = new Date(); - final OtherResearchProduct orp = new OtherResearchProduct(); - final List> res = new ArrayList<>(); - String raidId = calculateOpenaireId(r.getRaid()); + protected static List> prepareRAiD(final RAiDEntity r) { - orp.setId(raidId); - orp.setCollectedfrom(RAID_COLLECTED_FROM); - orp.setDataInfo(RAID_DATA_INFO); - orp.setResourcetype(RAID_QUALIFIER); - orp.setTitle( - Collections.singletonList( - structuredProperty( - r.getTitle(), - qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), - RAID_DATA_INFO)) - ); - orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); - orp.setAuthor(createAuthors(r.getAuthors())); - orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); - orp.setSubject( - r.getSubjects() - .stream() - .map(s -> subject(s, qualifier(DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, DNET_SUBJECT_TYPOLOGIES), RAID_DATA_INFO)) - .collect(Collectors.toList()) - ); - orp.setRelevantdate( - Arrays.asList( - structuredProperty(r.getEndDate(), qualifier("endDate","endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO), - structuredProperty(r.getStartDate(), qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO) - ) - ); - orp.setLastupdatetimestamp(now.getTime()); - orp.setDateofcollection(r.getStartDate()); + final Date now = new Date(); + final OtherResearchProduct orp = new OtherResearchProduct(); + final List> res = new ArrayList<>(); + String raidId = calculateOpenaireId(r.getRaid()); - res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); + orp.setId(raidId); + orp.setCollectedfrom(RAID_COLLECTED_FROM); + orp.setDataInfo(RAID_DATA_INFO); + orp + .setTitle( + Collections + .singletonList( + structuredProperty( + r.getTitle(), + qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), + RAID_DATA_INFO))); + orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); +// orp.setAuthor(createAuthors(r.getAuthors())); + orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + orp + .setSubject( + r + .getSubjects() + .stream() + .map( + s -> subject( + s, + qualifier( + DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, + DNET_SUBJECT_TYPOLOGIES), + RAID_DATA_INFO)) + .collect(Collectors.toList())); + orp + .setRelevantdate( + Arrays + .asList( + structuredProperty( + r.getEndDate(), qualifier("endDate", "endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + RAID_DATA_INFO), + structuredProperty( + r.getStartDate(), + qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + RAID_DATA_INFO))); + orp.setLastupdatetimestamp(now.getTime()); + orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO)); - for(String resultId: r.getIds()) { - Relation rel1 = OafMapperUtils.getRelation( - raidId, - resultId, - ModelConstants.RESULT_RESULT, - ModelConstants.OUTCOME, - PART, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null - ); - Relation rel2 = OafMapperUtils.getRelation( - resultId, - raidId, - ModelConstants.RESULT_RESULT, - ModelConstants.OUTCOME, - IS_PART_OF, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null - ); - res.add(new AtomicAction<>(Relation.class, rel1)); - res.add(new AtomicAction<>(Relation.class, rel2)); - } + res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); - return res; - } + for (String resultId : r.getIds()) { + Relation rel1 = OafMapperUtils + .getRelation( + raidId, + resultId, + ModelConstants.RESULT_RESULT, + PART, + HAS_PART, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null); + Relation rel2 = OafMapperUtils + .getRelation( + resultId, + raidId, + ModelConstants.RESULT_RESULT, + PART, + IS_PART_OF, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null); + res.add(new AtomicAction<>(Relation.class, rel1)); + res.add(new AtomicAction<>(Relation.class, rel2)); + } - public static String calculateOpenaireId(final String raid) { - return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); - } + return res; + } - public static List createAuthors(final List author) { - return author.stream().map(s-> { - Author a = new Author(); - a.setFullname(s); - return a; - }).collect(Collectors.toList()); - } + public static String calculateOpenaireId(final String raid) { + return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + } - private static JavaRDD readInputPath( - final SparkSession spark, - final String path) { + public static List createAuthors(final List author) { + return author.stream().map(s -> { + Author a = new Author(); + a.setFullname(s); + return a; + }).collect(Collectors.toList()); + } - return spark - .read() - .json(path) - .as(Encoders.bean(RAiDEntity.class)) - .toJavaRDD(); + private static JavaRDD readInputPath( + final SparkSession spark, + final String path) { - } + return spark + .read() + .json(path) + .as(Encoders.bean(RAiDEntity.class)) + .toJavaRDD(); + + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java index b0aec71d3f..856b52e181 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java @@ -1,2 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.raid.model;public class GenerateRAiDActionSetJob { + +package eu.dnetlib.dhp.actionmanager.raid.model; + +public class GenerateRAiDActionSetJob { } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java index bd7e28926e..1203b28a76 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.raid.model; import java.io.Serializable; @@ -5,98 +6,101 @@ import java.util.List; public class RAiDEntity implements Serializable { - String raid; - List authors; - String startDate; - String endDate; - List subjects; - List titles; - List ids; - String title; - String summary; + String raid; + List authors; + String startDate; + String endDate; + List subjects; + List titles; + List ids; + String title; + String summary; - public RAiDEntity(){} - public RAiDEntity(String raid, List authors, String startDate, String endDate, List subjects, List titles, List ids, String title, String summary) { - this.raid = raid; - this.authors = authors; - this.startDate = startDate; - this.endDate = endDate; - this.subjects = subjects; - this.titles = titles; - this.ids = ids; - this.title = title; - this.summary = summary; - } + public RAiDEntity() { + } - public String getRaid() { - return raid; - } + public RAiDEntity(String raid, List authors, String startDate, String endDate, List subjects, + List titles, List ids, String title, String summary) { + this.raid = raid; + this.authors = authors; + this.startDate = startDate; + this.endDate = endDate; + this.subjects = subjects; + this.titles = titles; + this.ids = ids; + this.title = title; + this.summary = summary; + } - public void setRaid(String raid) { - this.raid = raid; - } + public String getRaid() { + return raid; + } - public List getAuthors() { - return authors; - } + public void setRaid(String raid) { + this.raid = raid; + } - public void setAuthors(List authors) { - this.authors = authors; - } + public List getAuthors() { + return authors; + } - public String getStartDate() { - return startDate; - } + public void setAuthors(List authors) { + this.authors = authors; + } - public void setStartDate(String startDate) { - this.startDate = startDate; - } + public String getStartDate() { + return startDate; + } - public String getEndDate() { - return endDate; - } + public void setStartDate(String startDate) { + this.startDate = startDate; + } - public void setEndDate(String endDate) { - this.endDate = endDate; - } + public String getEndDate() { + return endDate; + } - public List getSubjects() { - return subjects; - } + public void setEndDate(String endDate) { + this.endDate = endDate; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public List getSubjects() { + return subjects; + } - public List getTitles() { - return titles; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public void setTitles(List titles) { - this.titles = titles; - } + public List getTitles() { + return titles; + } - public List getIds() { - return ids; - } + public void setTitles(List titles) { + this.titles = titles; + } - public void setIds(List ids) { - this.ids = ids; - } + public List getIds() { + return ids; + } - public String getTitle() { - return title; - } + public void setIds(List ids) { + this.ids = ids; + } - public void setTitle(String title) { - this.title = title; - } + public String getTitle() { + return title; + } - public String getSummary() { - return summary; - } + public void setTitle(String title) { + this.title = title; + } - public void setSummary(String summary) { - this.summary = summary; - } + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java index ce1973a7f5..6e8f48bda2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -21,7 +21,6 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -45,6 +44,7 @@ import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java index a8dacd1323..5ac1920eaa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.sx.bio.pubmed; /** @@ -7,32 +8,33 @@ package eu.dnetlib.dhp.sx.bio.pubmed; */ public class PMAffiliation { - private String name; + private String name; - private PMIdentifier identifier; + private PMIdentifier identifier; - public PMAffiliation() { + public PMAffiliation() { - } - public PMAffiliation(String name, PMIdentifier identifier) { - this.name = name; - this.identifier = identifier; - } + } - public String getName() { - return name; - } + public PMAffiliation(String name, PMIdentifier identifier) { + this.name = name; + this.identifier = identifier; + } - public void setName(String name) { - this.name = name; - } + public String getName() { + return name; + } - public PMIdentifier getIdentifier() { - return identifier; - } + public void setName(String name) { + this.name = name; + } - public PMAffiliation setIdentifier(PMIdentifier identifier) { - this.identifier = identifier; - return this; - } + public PMIdentifier getIdentifier() { + return identifier; + } + + public PMAffiliation setIdentifier(PMIdentifier identifier) { + this.identifier = identifier; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java index b0df256634..e023f2e626 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -97,5 +97,4 @@ public class PMAuthor implements Serializable { this.affiliation = affiliation; } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java index 0c8c55e40e..6cd17a90c0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java @@ -1,53 +1,53 @@ + package eu.dnetlib.dhp.sx.bio.pubmed; public class PMIdentifier { - private String pid; - private String type; + private String pid; + private String type; + public PMIdentifier(String pid, String type) { + this.pid = cleanPid(pid); + this.type = type; + } - public PMIdentifier(String pid, String type) { - this.pid = cleanPid(pid); - this.type = type; - } + public PMIdentifier() { - public PMIdentifier() { + } - } + private String cleanPid(String pid) { - private String cleanPid(String pid) { + if (pid == null) { + return null; + } - if (pid == null) { - return null; - } + // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 + if (pid.matches("[0-9]{15}[0-9X]")) { + return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); + } - // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 - if (pid.matches("[0-9]{15}[0-9X]")) { - return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); - } + // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 + if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { + return pid.replaceAll("http://orcid.org/", ""); + } + return pid; + } - // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 - if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { - return pid.replaceAll("http://orcid.org/", ""); - } - return pid; - } + public String getPid() { + return pid; + } - public String getPid() { - return pid; - } + public PMIdentifier setPid(String pid) { + this.pid = cleanPid(pid); + return this; + } - public PMIdentifier setPid(String pid) { - this.pid = cleanPid(pid); - return this; - } + public String getType() { + return type; + } - public String getType() { - return type; - } - - public PMIdentifier setType(String type) { - this.type = type; - return this; - } + public PMIdentifier setType(String type) { + this.type = type; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml index 9b5aa5905e..d3392596fd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml @@ -20,8 +20,6 @@ - - diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index e4a238c8f3..de68ebb58e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -673,7 +673,6 @@ case object Crossref2Oaf { val doi = input.getString(0) val rorId = input.getString(1) - val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true) val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala index 2eb4bea65c..bc9a2cf02b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala @@ -82,21 +82,22 @@ class PMParser2 { a.setLastName((author \ "LastName").text) a.setForeName((author \ "ForeName").text) val id = (author \ "Identifier").text - val idType =(author \ "Identifier" \ "@Source").text + val idType = (author \ "Identifier" \ "@Source").text - if(id != null && id.nonEmpty && idType != null && idType.nonEmpty) { + if (id != null && id.nonEmpty && idType != null && idType.nonEmpty) { a.setIdentifier(new PMIdentifier(id, idType)) } - val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text - val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text + val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text - if(affiliation != null && affiliation.nonEmpty) { + if (affiliation != null && affiliation.nonEmpty) { val aff = new PMAffiliation() aff.setName(affiliation) - if(affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty) { + if ( + affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty + ) { aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType)) } a.setAffiliation(aff) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 5e14c731a6..281ca0e07d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -294,11 +294,23 @@ object PubMedToOaf { author.setName(a.getForeName) author.setSurname(a.getLastName) author.setFullname(a.getFullName) - if(a.getIdentifier != null) { - author.setPid(List(OafMapperUtils.structuredProperty(a.getIdentifier.getPid, - OafMapperUtils.qualifier(a.getIdentifier.getType,a.getIdentifier.getType,ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES), dataInfo)).asJava) + if (a.getIdentifier != null) { + author.setPid( + List( + OafMapperUtils.structuredProperty( + a.getIdentifier.getPid, + OafMapperUtils.qualifier( + a.getIdentifier.getType, + a.getIdentifier.getType, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES + ), + dataInfo + ) + ).asJava + ) } - if (a.getAffiliation!= null) + if (a.getAffiliation != null) author.setRawAffiliationString(List(a.getAffiliation.getName).asJava) author.setRank(index + 1) author diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java index 1f33f45b2b..9417822af8 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java @@ -1,11 +1,16 @@ + package eu.dnetlib.dhp.actionmanager.raid; -import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; -import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Relation; +import static java.nio.file.Files.createTempDirectory; + +import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.File; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; @@ -20,93 +25,141 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -import java.io.File; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.List; - -import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; -import static java.nio.file.Files.createTempDirectory; -import static org.junit.jupiter.api.Assertions.assertEquals; - public class GenerateRAiDActionSetJobTest { - private static String input_path; - private static String output_path; - static SparkSession spark; + private static String input_path; + private static String output_path; + static SparkSession spark; - @BeforeEach - void setUp() throws Exception { + @BeforeEach + void setUp() throws Exception { - input_path = Paths - .get(GenerateRAiDActionSetJobTest.class.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json").toURI()) - .toFile() - .getAbsolutePath(); + input_path = Paths + .get( + GenerateRAiDActionSetJobTest.class + .getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json") + .toURI()) + .toFile() + .getAbsolutePath(); - output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") - .toAbsolutePath() - .toString(); + output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); - SparkConf conf = new SparkConf(); - conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", output_path); - conf.set("hive.metastore.warehouse.dir", output_path); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", output_path); + conf.set("hive.metastore.warehouse.dir", output_path); - spark = SparkSession - .builder() - .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - static void cleanUp() throws Exception { - FileUtils.deleteDirectory(new File(output_path)); - } + @AfterAll + static void cleanUp() throws Exception { + FileUtils.deleteDirectory(new File(output_path)); + } - @Test - @Disabled - void testProcessRAiDEntities() { - GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); + @Test + @Disabled + void testProcessRAiDEntities() { + GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD result = sc - .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(AtomicAction::getPayload); + JavaRDD result = sc + .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(AtomicAction::getPayload); - assertEquals(80, result.count()); - } + assertEquals(80, result.count()); + } - @Test - void testPrepareRAiD() { + @Test + void testPrepareRAiD() { - List> atomicActions = GenerateRAiDActionSetJob.prepareRAiD(new RAiDEntity( - "-92190526", - Arrays.asList("Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"), - "2021-09-10", - "2024-02-16", - Arrays.asList("cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"), - Arrays.asList("Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment 'More is Less' from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), - Arrays.asList("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), - "Exploring Multi-Scale Map Generalization and Design", - "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval." - )); + List> atomicActions = GenerateRAiDActionSetJob + .prepareRAiD( + new RAiDEntity( + "-92190526", + Arrays + .asList( + "Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", + "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", + "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", + "Maïeul GRUGET", "Cécile Duchêne"), + "2021-09-10", + "2024-02-16", + Arrays + .asList( + "cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", + "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", + "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", + "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", + "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", + "General Medicine", "Geography, Planning and Development", "multi-scales", + "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", + "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", + "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", + "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", + "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", + "open source", "drawing", "Constraint", "Multi-scale maps"), + Arrays + .asList( + "Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", + "Collection of cartographic disorientation stories", "Anchorwhat dataset", + "BasqueRoads: A Benchmark for Road Network Selection", + "Progressive river network selection for pan-scalar maps", + "BasqueRoads, a dataset to benchmark road selection algorithms", + "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", + "Empirical approach to advance the generalisation of multi-scale maps", + "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", + "eye-tracking data from a survey on zooming in a pan-scalar map", + "Material of the experiment 'More is Less' from the MapMuxing project", + "Cartagen4py, an open source Python library for map generalisation", + "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), + Arrays + .asList( + "50|doi_dedup___::6915135e0aa39f913394513f809ae58a", + "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", + "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", + "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", + "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", + "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", + "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", + "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", + "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", + "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", + "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", + "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", + "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", + "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), + "Exploring Multi-Scale Map Generalization and Design", + "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.")); - OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload(); - Relation rel = (Relation) atomicActions.get(1).getPayload(); + OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload(); + Relation rel = (Relation) atomicActions.get(1).getPayload(); - assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue()); - assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource()); - assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget()); + assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue()); + assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource()); + assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget()); - } + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 4a926df015..cb7826dbff 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -63,7 +63,6 @@ class BioScholixTest extends AbstractVocabularyTest { "0000000333457333", "0000000335964515", "0000000302921949", - "http://orcid.org/0000-0001-8567-3543", "http://orcid.org/0000-0001-7868-8528", "0000-0001-9189-1440", diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index 64cbd70bac..0d6c816278 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -130,7 +130,7 @@ public class ResultTagger implements Serializable { // log.info("Remove constraints for " + communityId); if (conf.getRemoveConstraintsMap().keySet().contains(communityId) && conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null && - !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && + !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getRemoveConstraintsMap() .get(communityId) @@ -228,7 +228,7 @@ public class ResultTagger implements Serializable { .forEach(communityId -> { if (!removeCommunities.contains(communityId) && conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && - !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && + !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getSelectionConstraintsMap() .get(communityId) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index ea9503d179..e1710db547 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -915,7 +915,8 @@ class MappersTest { @Test void testODFRecord_guidelines4() throws IOException { - final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); final Publication p = (Publication) list.get(0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 0da0f6955c..2c977a390b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,7 +5,6 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.solr.PersonTopic; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -40,6 +39,7 @@ import eu.dnetlib.dhp.schema.solr.OpenAccessColor; import eu.dnetlib.dhp.schema.solr.OpenAccessRoute; import eu.dnetlib.dhp.schema.solr.Organization; import eu.dnetlib.dhp.schema.solr.Person; +import eu.dnetlib.dhp.schema.solr.PersonTopic; import eu.dnetlib.dhp.schema.solr.Pid; import eu.dnetlib.dhp.schema.solr.Project; import eu.dnetlib.dhp.schema.solr.Result; @@ -216,11 +216,14 @@ public class ProvisionModelSupport { } private static List mapPersonTopics(List subjects) { - return Optional.ofNullable(subjects) - .map(ss -> ss.stream() - .map(ProvisionModelSupport::mapPersonTopic) - .collect(Collectors.toList())) - .orElse(null); + return Optional + .ofNullable(subjects) + .map( + ss -> ss + .stream() + .map(ProvisionModelSupport::mapPersonTopic) + .collect(Collectors.toList())) + .orElse(null); } private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) { From 1c144a4dcb951319ed88a7cac4825837c5385316 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 6 Dec 2024 09:18:10 +0100 Subject: [PATCH 3/5] minor change --- .../dhp/actionmanager/raid/GenerateRAiDActionSetJob.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index 3b24059568..c82934cdb3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -43,7 +43,8 @@ public class GenerateRAiDActionSetJob { private static final List RAID_COLLECTED_FROM = listKeyValues( OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - private static final Qualifier RAID_QUALIFIER = qualifier("0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); + private static final Qualifier RAID_QUALIFIER = qualifier( + "0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier( "raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); From dade7d5bb86d4030fc2b69f3a26940e055e216eb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 6 Dec 2024 10:02:07 +0100 Subject: [PATCH 4/5] minor changes --- .../java/eu/dnetlib/dhp/common/Constants.java | 5 ++-- .../raid/GenerateRAiDActionSetJob.java | 27 +++++++------------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index b00199ea54..6a4bb34d3f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -11,8 +11,9 @@ public class Constants { public static final Map coarCodeLabelMap = Maps.newHashMap(); public static final String RAID_NS_PREFIX = "raid________"; - public static final String RAID_DATASOURCE_NAME = "Research Activity Identifier Service (RAiD)"; - public static final String RAID_OPENAIRE_ID = ""; + + public static final String END_DATE = "endDate"; + public static final String START_DATE = "startDate"; public static final String ROR_NS_PREFIX = "ror_________"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index c82934cdb3..e67e7171fb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.actionmanager.raid; import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; @@ -24,7 +25,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -120,8 +120,10 @@ public class GenerateRAiDActionSetJob { qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), RAID_DATA_INFO))); orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); -// orp.setAuthor(createAuthors(r.getAuthors())); - orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + + Instance instance = new Instance(); + instance.setInstancetype(RAID_QUALIFIER); + orp.setInstance(Collections.singletonList(instance)); orp .setSubject( r @@ -140,11 +142,11 @@ public class GenerateRAiDActionSetJob { Arrays .asList( structuredProperty( - r.getEndDate(), qualifier("endDate", "endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + r.getEndDate(), qualifier(END_DATE, END_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO), structuredProperty( r.getStartDate(), - qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + qualifier(START_DATE, START_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO))); orp.setLastupdatetimestamp(now.getTime()); orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO)); @@ -159,11 +161,7 @@ public class GenerateRAiDActionSetJob { ModelConstants.RESULT_RESULT, PART, HAS_PART, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null); + orp); Relation rel2 = OafMapperUtils .getRelation( resultId, @@ -171,11 +169,7 @@ public class GenerateRAiDActionSetJob { ModelConstants.RESULT_RESULT, PART, IS_PART_OF, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null); + orp); res.add(new AtomicAction<>(Relation.class, rel1)); res.add(new AtomicAction<>(Relation.class, rel2)); } @@ -184,7 +178,7 @@ public class GenerateRAiDActionSetJob { } public static String calculateOpenaireId(final String raid) { - return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + return String.format("50|%s::%s", RAID_NS_PREFIX, DHPUtils.md5(raid)); } public static List createAuthors(final List author) { @@ -204,7 +198,6 @@ public class GenerateRAiDActionSetJob { .json(path) .as(Encoders.bean(RAiDEntity.class)) .toJavaRDD(); - } } From 8a5ba8df45d6fb1b570853307fb99f465d0667f5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 6 Dec 2024 10:03:11 +0100 Subject: [PATCH 5/5] minor changes --- .../main/java/eu/dnetlib/dhp/actionmanager/Constants.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 394cc22a35..d7ad7fcb95 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -112,12 +112,6 @@ public class Constants { } - public static Instance getInstance(Qualifier qualifier) { - Instance instance = new Instance(); - instance.setInstancetype(qualifier); - return instance; - } - public static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); }