1
0
Fork 0

Merge remote-tracking branch 'origin/beta' into beta

This commit is contained in:
Sandro La Bruzzo 2024-12-06 14:23:58 +01:00
commit dd6ed31383
26 changed files with 1259 additions and 82 deletions

View File

@ -10,6 +10,11 @@ public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap(); public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap(); public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static final String RAID_NS_PREFIX = "raid________";
public static final String END_DATE = "endDate";
public static final String START_DATE = "startDate";
public static final String ROR_NS_PREFIX = "ror_________"; public static final String ROR_NS_PREFIX = "ror_________";
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222"; public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";

View File

@ -13,6 +13,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

View File

@ -0,0 +1,203 @@
package eu.dnetlib.dhp.actionmanager.raid;
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID;
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class GenerateRAiDActionSetJob {
private static final Logger log = LoggerFactory
.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final List<KeyValue> RAID_COLLECTED_FROM = listKeyValues(
OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
private static final Qualifier RAID_QUALIFIER = qualifier(
"0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE);
private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier(
"raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS);
private static final DataInfo RAID_DATA_INFO = dataInfo(
false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_INFERENCE_QUALIFIER, "0.92");
public static void main(final String[] args) throws Exception {
final String jsonConfiguration = IOUtils
.toString(
eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
processRAiDEntities(spark, inputPath, outputPath);
});
}
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
static void processRAiDEntities(final SparkSession spark,
final String inputPath,
final String outputPath) {
readInputPath(spark, inputPath)
.map(GenerateRAiDActionSetJob::prepareRAiD)
.flatMap(List::iterator)
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
protected static List<AtomicAction<? extends Oaf>> prepareRAiD(final RAiDEntity r) {
final Date now = new Date();
final OtherResearchProduct orp = new OtherResearchProduct();
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
String raidId = calculateOpenaireId(r.getRaid());
orp.setId(raidId);
orp.setCollectedfrom(RAID_COLLECTED_FROM);
orp.setDataInfo(RAID_DATA_INFO);
orp
.setTitle(
Collections
.singletonList(
structuredProperty(
r.getTitle(),
qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE),
RAID_DATA_INFO)));
orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary()));
Instance instance = new Instance();
instance.setInstancetype(RAID_QUALIFIER);
orp.setInstance(Collections.singletonList(instance));
orp
.setSubject(
r
.getSubjects()
.stream()
.map(
s -> subject(
s,
qualifier(
DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES,
DNET_SUBJECT_TYPOLOGIES),
RAID_DATA_INFO))
.collect(Collectors.toList()));
orp
.setRelevantdate(
Arrays
.asList(
structuredProperty(
r.getEndDate(), qualifier(END_DATE, END_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
RAID_DATA_INFO),
structuredProperty(
r.getStartDate(),
qualifier(START_DATE, START_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
RAID_DATA_INFO)));
orp.setLastupdatetimestamp(now.getTime());
orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO));
res.add(new AtomicAction<>(OtherResearchProduct.class, orp));
for (String resultId : r.getIds()) {
Relation rel1 = OafMapperUtils
.getRelation(
raidId,
resultId,
ModelConstants.RESULT_RESULT,
PART,
HAS_PART,
orp);
Relation rel2 = OafMapperUtils
.getRelation(
resultId,
raidId,
ModelConstants.RESULT_RESULT,
PART,
IS_PART_OF,
orp);
res.add(new AtomicAction<>(Relation.class, rel1));
res.add(new AtomicAction<>(Relation.class, rel2));
}
return res;
}
public static String calculateOpenaireId(final String raid) {
return String.format("50|%s::%s", RAID_NS_PREFIX, DHPUtils.md5(raid));
}
public static List<Author> createAuthors(final List<String> author) {
return author.stream().map(s -> {
Author a = new Author();
a.setFullname(s);
return a;
}).collect(Collectors.toList());
}
private static JavaRDD<RAiDEntity> readInputPath(
final SparkSession spark,
final String path) {
return spark
.read()
.json(path)
.as(Encoders.bean(RAiDEntity.class))
.toJavaRDD();
}
}

View File

@ -0,0 +1,5 @@
package eu.dnetlib.dhp.actionmanager.raid.model;
public class GenerateRAiDActionSetJob {
}

View File

@ -0,0 +1,106 @@
package eu.dnetlib.dhp.actionmanager.raid.model;
import java.io.Serializable;
import java.util.List;
public class RAiDEntity implements Serializable {
String raid;
List<String> authors;
String startDate;
String endDate;
List<String> subjects;
List<String> titles;
List<String> ids;
String title;
String summary;
public RAiDEntity() {
}
public RAiDEntity(String raid, List<String> authors, String startDate, String endDate, List<String> subjects,
List<String> titles, List<String> ids, String title, String summary) {
this.raid = raid;
this.authors = authors;
this.startDate = startDate;
this.endDate = endDate;
this.subjects = subjects;
this.titles = titles;
this.ids = ids;
this.title = title;
this.summary = summary;
}
public String getRaid() {
return raid;
}
public void setRaid(String raid) {
this.raid = raid;
}
public List<String> getAuthors() {
return authors;
}
public void setAuthors(List<String> authors) {
this.authors = authors;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public List<String> getSubjects() {
return subjects;
}
public void setSubjects(List<String> subjects) {
this.subjects = subjects;
}
public List<String> getTitles() {
return titles;
}
public void setTitles(List<String> titles) {
this.titles = titles;
}
public List<String> getIds() {
return ids;
}
public void setIds(List<String> ids) {
this.ids = ids;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
}

View File

@ -44,13 +44,7 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2; import scala.Tuple2;

View File

@ -28,6 +28,7 @@ import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.zenodo.CollectZenodoDumpCollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
@ -129,6 +130,8 @@ public class CollectorWorker extends ReportingJob {
return new Gtr2PublicationsCollectorPlugin(this.clientParams); return new Gtr2PublicationsCollectorPlugin(this.clientParams);
case osfPreprints: case osfPreprints:
return new OsfPreprintsCollectorPlugin(this.clientParams); return new OsfPreprintsCollectorPlugin(this.clientParams);
case zenodoDump:
return new CollectZenodoDumpCollectorPlugin();
case other: case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(this.api.getParams().get("other_plugin_type")) .ofNullable(this.api.getParams().get("other_plugin_type"))

View File

@ -11,7 +11,7 @@ public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints; oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints, zenodoDump;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -0,0 +1,96 @@
package eu.dnetlib.dhp.collection.plugin.zenodo;
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
import java.io.IOException;
import java.io.InputStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class CollectZenodoDumpCollectorPlugin implements CollectorPlugin {
final private Logger log = LoggerFactory.getLogger(getClass());
private void downloadItem(final String name, final String itemURL, final String basePath,
final FileSystem fileSystem) {
try {
final Path hdfsWritePath = new Path(String.format("%s/%s", basePath, name));
final FSDataOutputStream fsDataOutputStream = fileSystem.create(hdfsWritePath, true);
final HttpGet request = new HttpGet(itemURL);
final int timeout = 60; // seconds
final RequestConfig config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000)
.build();
log.info("Downloading url {} into {}", itemURL, hdfsWritePath.getName());
try (CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build();
CloseableHttpResponse response = client.execute(request)) {
int responseCode = response.getStatusLine().getStatusCode();
log.info("Response code is {}", responseCode);
if (responseCode >= 200 && responseCode < 400) {
IOUtils.copy(response.getEntity().getContent(), fsDataOutputStream);
}
} catch (Throwable eu) {
throw new RuntimeException(eu);
}
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
try {
final String zenodoURL = api.getBaseUrl();
final String hdfsURI = api.getParams().get("hdfsURI");
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsURI));
downloadItem("zenodoDump.tar.gz", zenodoURL, "/tmp", fileSystem);
CompressionCodecFactory factory = new CompressionCodecFactory(fileSystem.getConf());
Path sourcePath = new Path("/tmp/zenodoDump.tar.gz");
CompressionCodec codec = factory.getCodec(sourcePath);
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fileSystem.open(sourcePath));
return iterateTar(gzipInputStream);
} catch (IOException e) {
throw new CollectorException(e);
} finally {
log.info("Closing gzip stream");
org.apache.hadoop.io.IOUtils.closeStream(gzipInputStream);
}
} catch (Exception e) {
throw new CollectorException(e);
}
}
private Stream<String> iterateTar(InputStream gzipInputStream) throws Exception {
Iterable<String> iterable = () -> new ZenodoTarIterator(gzipInputStream);
return StreamSupport.stream(iterable.spliterator(), false);
}
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.dhp.collection.plugin.zenodo;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.IOUtils;
public class ZenodoTarIterator implements Iterator<String>, Closeable {
private final InputStream gzipInputStream;
private final StringBuilder currentItem = new StringBuilder();
private TarArchiveInputStream tais;
private boolean hasNext;
public ZenodoTarIterator(InputStream gzipInputStream) {
this.gzipInputStream = gzipInputStream;
tais = new TarArchiveInputStream(gzipInputStream);
hasNext = getNextItem();
}
private boolean getNextItem() {
try {
TarArchiveEntry entry;
while ((entry = tais.getNextTarEntry()) != null) {
if (entry.isFile()) {
currentItem.setLength(0);
currentItem.append(IOUtils.toString(new InputStreamReader(tais)));
return true;
}
}
return false;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
@Override
public boolean hasNext() {
return hasNext;
}
@Override
public String next() {
final String data = currentItem.toString();
hasNext = getNextItem();
return data;
}
@Override
public void close() throws IOException {
gzipInputStream.close();
}
}

View File

@ -0,0 +1,14 @@
[
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the path of the input json",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,58 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,53 @@
<workflow-app name="Update_RAiD_action_set" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>raidJsonInputPath</name>
<description>the path of the json</description>
</property>
<property>
<name>raidActionSetPath</name>
<description>path where to store the action set</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path='${raidActionSetPath}'/>
<mkdir path='${raidActionSetPath}'/>
</fs>
<ok to="processRAiDFile"/>
<error to="Kill"/>
</action>
<action name="processRAiDFile">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProcessRAiDFile</name>
<class>eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${raidJsonInputPath}</arg>
<arg>--outputPath</arg><arg>${raidActionSetPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -37,7 +37,7 @@ case class mappingAuthor(
family: Option[String], family: Option[String],
sequence: Option[String], sequence: Option[String],
ORCID: Option[String], ORCID: Option[String],
affiliation: Option[mappingAffiliation] affiliation: Option[List[mappingAffiliation]]
) {} ) {}
case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {} case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {}
@ -457,15 +457,14 @@ case object Crossref2Oaf {
} }
//Mapping Author //Mapping Author
val authorList: List[mappingAuthor] = val authorList: List[mappingAuthor] = (json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined)
(json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined)
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) => val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first") a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
) )
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) => result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
generateAuhtor(a.given.orNull, a.family.get, a.ORCID.orNull, index) generateAuthor(a.given.orNull, a.family.get, a.ORCID.orNull, index, a.affiliation)
}.asJava) }.asJava)
// Mapping instance // Mapping instance
@ -504,19 +503,6 @@ case object Crossref2Oaf {
) )
} }
val is_review = json \ "relation" \ "is-review-of" \ "id"
if (is_review != JNothing) {
instance.setInstancetype(
OafMapperUtils.qualifier(
"0015",
"peerReviewed",
ModelConstants.DNET_REVIEW_LEVELS,
ModelConstants.DNET_REVIEW_LEVELS
)
)
}
if (doi.startsWith("10.3410") || doi.startsWith("10.12703")) if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
instance.setHostedby( instance.setHostedby(
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect") OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
@ -574,12 +560,23 @@ case object Crossref2Oaf {
s"50|doiboost____|$id" s"50|doiboost____|$id"
} }
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = { private def generateAuthor(
given: String,
family: String,
orcid: String,
index: Int,
affiliation: Option[List[mappingAffiliation]]
): Author = {
val a = new Author val a = new Author
a.setName(given) a.setName(given)
a.setSurname(family) a.setSurname(family)
a.setFullname(s"$given $family") a.setFullname(s"$given $family")
a.setRank(index + 1) a.setRank(index + 1)
// Adding Raw affiliation if it's defined
if (affiliation.isDefined) {
a.setRawAffiliationString(affiliation.get.map(a => a.name).asJava)
}
if (StringUtils.isNotBlank(orcid)) if (StringUtils.isNotBlank(orcid))
a.setPid( a.setPid(
List( List(
@ -705,7 +702,15 @@ case object Crossref2Oaf {
val objectType = (json \ "type").extractOrElse[String](null) val objectType = (json \ "type").extractOrElse[String](null)
if (objectType == null) if (objectType == null)
return resultList return resultList
val typology = getTypeQualifier(objectType, vocabularies)
// If the item has a relations is-review-of, then we force it to a peer-review
val is_review = json \ "relation" \ "is-review-of" \ "id"
var force_to_review = false
if (is_review != JNothing) {
force_to_review = true
}
val typology = getTypeQualifier(if (force_to_review) "peer-review" else objectType, vocabularies)
if (typology == null) if (typology == null)
return List() return List()
@ -757,33 +762,6 @@ case object Crossref2Oaf {
else else
resultList resultList
} }
// if (uw != null) {
// result.getCollectedfrom.add(createUnpayWallCollectedFrom())
// val i: Instance = new Instance()
// i.setCollectedfrom(createUnpayWallCollectedFrom())
// if (uw.best_oa_location != null) {
//
// i.setUrl(List(uw.best_oa_location.url).asJava)
// if (uw.best_oa_location.license.isDefined) {
// i.setLicense(field[String](uw.best_oa_location.license.get, null))
// }
//
// val colour = get_unpaywall_color(uw.oa_status)
// if (colour.isDefined) {
// val a = new AccessRight
// a.setClassid(ModelConstants.ACCESS_RIGHT_OPEN)
// a.setClassname(ModelConstants.ACCESS_RIGHT_OPEN)
// a.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
// a.setSchemename(ModelConstants.DNET_ACCESS_MODES)
// a.setOpenAccessRoute(colour.get)
// i.setAccessright(a)
// }
// i.setPid(result.getPid)
// result.getInstance().add(i)
// }
// }
} }
private def createCiteRelation(source: Result, targetPid: String, targetPidType: String): List[Relation] = { private def createCiteRelation(source: Result, targetPid: String, targetPidType: String): List[Relation] = {

View File

@ -0,0 +1,165 @@
package eu.dnetlib.dhp.actionmanager.raid;
import static java.nio.file.Files.createTempDirectory;
import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest;
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class GenerateRAiDActionSetJobTest {
private static String input_path;
private static String output_path;
static SparkSession spark;
@BeforeEach
void setUp() throws Exception {
input_path = Paths
.get(
GenerateRAiDActionSetJobTest.class
.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json")
.toURI())
.toFile()
.getAbsolutePath();
output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-")
.toAbsolutePath()
.toString();
SparkConf conf = new SparkConf();
conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", output_path);
conf.set("hive.metastore.warehouse.dir", output_path);
spark = SparkSession
.builder()
.appName(GenerateRAiDActionSetJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
static void cleanUp() throws Exception {
FileUtils.deleteDirectory(new File(output_path));
}
@Test
@Disabled
void testProcessRAiDEntities() {
GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set");
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<? extends Oaf> result = sc
.sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(AtomicAction::getPayload);
assertEquals(80, result.count());
}
@Test
void testPrepareRAiD() {
List<AtomicAction<? extends Oaf>> atomicActions = GenerateRAiDActionSetJob
.prepareRAiD(
new RAiDEntity(
"-92190526",
Arrays
.asList(
"Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura",
"Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume",
"Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont",
"Maïeul GRUGET", "Cécile Duchêne"),
"2021-09-10",
"2024-02-16",
Arrays
.asList(
"cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps",
"pan-scalar map", "Python library", "QGIS", "map design", "landmarks",
"Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]",
"[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography",
"eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency",
"General Medicine", "Geography, Planning and Development", "multi-scales",
"pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences",
"progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design",
"cartography, map generalisation, zoom, multi-scale map", "Interactive maps",
"Map generalisation", "Earth and Planetary Sciences (miscellaneous)",
"Cartographic generalization", "rivers", "Benchmark", "General Environmental Science",
"open source", "drawing", "Constraint", "Multi-scale maps"),
Arrays
.asList(
"Where do people look at during multi-scale map tasks?", "FogDetector survey raw data",
"Collection of cartographic disorientation stories", "Anchorwhat dataset",
"BasqueRoads: A Benchmark for Road Network Selection",
"Progressive river network selection for pan-scalar maps",
"BasqueRoads, a dataset to benchmark road selection algorithms",
"Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps",
"Empirical approach to advance the generalisation of multi-scale maps",
"L'Alpe d'Huez: a dataset to benchmark topographic map generalisation",
"eye-tracking data from a survey on zooming in a pan-scalar map",
"Material of the experiment 'More is Less' from the MapMuxing project",
"Cartagen4py, an open source Python library for map generalisation",
"LAlpe dHuez: A Benchmark for Topographic Map Generalisation"),
Arrays
.asList(
"50|doi_dedup___::6915135e0aa39f913394513f809ae58a",
"50|doi_dedup___::754e3c283639bc6e104c925ff3e34007",
"50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0",
"50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a",
"50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153",
"50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a",
"50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13",
"50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4",
"50|doi_dedup___::a9bc4453273b2d02648a5cb453195042",
"50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7",
"50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5",
"50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283",
"50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea",
"50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"),
"Exploring Multi-Scale Map Generalization and Design",
"This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval."));
OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload();
Relation rel = (Relation) atomicActions.get(1).getPayload();
assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue());
assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource());
assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget());
}
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.collection.plugin.zenodo;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.util.zip.GZIPInputStream;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class ZenodoPluginCollectionTest {
@Test
public void testZenodoIterator() throws Exception {
final GZIPInputStream gis = new GZIPInputStream(
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz"));
try (ZenodoTarIterator it = new ZenodoTarIterator(gis)) {
Assertions.assertTrue(it.hasNext());
int i = 0;
while (it.hasNext()) {
Assertions.assertNotNull(it.next());
i++;
}
Assertions.assertEquals(10, i);
}
}
}

View File

@ -0,0 +1,6 @@
{"raid": "-9222092103004099540", "authors": ["Department of Archaeology & Museums", "Department of Archaeology and Museums", "Department Of Archaeology & Museums"], "subjects": ["Begamganj", "Raisen", "Bhopal", "Budhni", "Malwa site survey", "सीहोर", "Gauharganj", "बुधनी", "Budni", "Berasia"], "titles": ["Malwa site survey : Raisen District, Begamganj Tahsīl, photographic documentation", "Malwa site survey : Bhopal District, photographic documentation (version 1, TIFF files)", "Malwa site survey : Raisen District, Gauharganj Tahsīl, village finds", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 1)", "Malwa site survey: Bhopal District, Berasia Tahsīl, photographic documentation (with villages named)", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 2)", "Malwa site survey : Bhopal District, photographic documentation (version 2, JPEG files)"], "ids": ["50|doi_dedup___::7523d165970830dd857e6cbea4302adf", "50|doi_dedup___::02309ae8a9fae291df321e317f5c5330", "50|doi_dedup___::95347ba2c4264414fab39712ee7fe481", "50|doi_dedup___::970aa708fe667596754fd02a708780f5", "50|doi_dedup___::b7cd9128cc53b1257a4f000347f339b0", "50|doi_dedup___::c7d65da0ecedef4d2c702b9db197d90c", "50|doi_dedup___::addbb67cf5046e340f342ba091bcebfa"], "title": "Documentation of Malwa Region", "summary": "This project involves the documentation of the Malwa region through photographic surveys. The surveys were conducted by the Department of Archaeology and Museums, Madhya Pradesh, and cover various districts and tahsils. The documentation includes photographic records of sites, villages, and other relevant features. The project aims to provide a comprehensive understanding of the region's cultural and historical significance.", "startDate": "2019-03-06", "endDate": "2019-03-08"}
{"raid": "-9221424331076109424", "authors": ["Hutchings, Judy", "Ward, Catherine", "Baban, Adriana", "D<><44>nil<69><6C>, Ingrid", "Frantz, Inga", "Gardner, Frances", "Lachman, Jamie", "Lachman, Jamie M.", "Foran, Heather", "Heinrichs, Nina", "Murphy, Hugh", "B<><42>ban, Adriana", "Raleva, Marija", "Fang, Xiangming", "Jansen, Elena", "Taut, Diana", "Foran, Heather M.", "T<><54>ut, Diana", "Ward, Catherine L.", "Williams, Margiad", "Lesco, Galina", "Brühl, Antonia"], "subjects": ["3. Good health", "5. Gender equality", "Criminology not elsewhere classified", "1. No poverty", "2. Zero hunger"], "titles": ["sj-docx-1-vaw-10.1177_10778012231188090 - Supplemental material for Co-Occurrence of Intimate Partner Violence Against Mothers and Maltreatment of Their Children With Behavioral Problems in Eastern Europe", "Hunger in vulnerable families in Southeastern Europe: Associations with health and violence", "Prevention of child mental health problems through parenting interventions in Southeastern Europe (RISE): study protocol for a multi-site randomised controlled trial"], "ids": ["50|doi_dedup___::a70015063e5400dae2e097ee10b4a589", "50|doi_dedup___::6e1d12026fcde9087724622ccdeed430", "50|doi_dedup___::5b7bd5d46c5d95e2ef5b36663504a67e"], "title": "Exploring the Impact of Hunger and Violence on Child Health in Southeastern Europe", "summary": "This study aims to investigate the relationship between hunger, violence, and child health in vulnerable families in Southeastern Europe. The research will explore the experiences of families in FYR Macedonia, Republic of Moldova, and Romania, and examine the associations between hunger, maltreatment, and other health indicators. The study will also test the efficacy of a parenting intervention targeting child behavioral problems in alleviating these issues. The findings of this research will contribute to the development of effective interventions to address the complex needs of vulnerable families in the region.", "startDate": "2019-06-04", "endDate": "2023-01-01"}
{"raid": "-9219052635741785098", "authors": ["Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"], "subjects": ["cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"], "titles": ["Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment \"More is Less\" from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "LAlpe dHuez: A Benchmark for Topographic Map Generalisation"], "ids": ["50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"], "title": "Exploring Multi-Scale Map Generalization and Design", "summary": "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.", "startDate": "2021-09-10", "endDate": "2024-02-16"}
{"raid": "-9216828847055450272", "authors": ["Grey, Alan", "Gorelov, Sergey", "Pall, Szilard", "Merz, Pascal", "Justin A., Lemkul", "Szilárd Páll", "Pasquadibisceglie, Andrea", "Kutzner, Carsten", "Schulz, Roland", "Nabet, Julien", "Abraham, Mark", "Jalalypour, Farzaneh", "Lundborg, Magnus", "Gray, Alan", "Villa, Alessandra", "Berk Hess", "Santuz, Hubert", "Irrgang, M. Eric", "Wingbermuehle, Sebastian", "Lemkul, Justin A.", "Jordan, Joe", "Pellegrino, Michele", "Doijade, Mahesh", "Shvetsov, Alexey", "Hess, Berk", "Behera, Sudarshan", "Andrey Alekseenko", "Shugaeva, Tatiana", "Fleischmann, Stefan", "Bergh, Cathrine", "Morozov, Dmitry", "Adam Hospital", "Briand, Eliane", "Lindahl, Erik", "Brown, Ania", "Marta Lloret Llinares", "Miletic, Vedran", "Alekseenko, Andrey", "Gouaillardet, Gilles", "Fiorin, Giacomo", "Basov, Vladimir"], "subjects": ["webinar"], "titles": ["Introduction to HPC: molecular dynamics simulations with GROMACS: log files", "BioExcel webinar #73: Competency frameworks to support training design and professional development", "Introduction to HPC: molecular dynamics simulations with GROMACS: output files - Devana", "GROMACS 2024.0 Manual", "BioExcel Webinar #71: GROMACS-PMX for accurate estimation of free energy differences", "Introduction to HPC: molecular dynamics simulations with GROMACS: input files", "BioExcel Webinar #68: What's new in GROMACS 2023", "BioExcel Webinar #69: BioBB-Wfs and BioBB-API, integrated web-based platform and programmatic interface for biomolecular simulations workflows using the BioExcel Building Blocks library", "GROMACS 2024-beta Source code"], "ids": ["50|doi_dedup___::8318fbc815ee1943c3269be7567f220b", "50|doi_dedup___::9530e03fb2aac63e82b18a40dc09e32c", "50|doi_dedup___::30174ab31075e76a428ca5b4f4d236b8", "50|doi_________::70b7c6dce09ae6f1361d22913fdf95eb", "50|doi_dedup___::337dd48600618f3c06257edd750d6201", "50|doi_dedup___::d622992ba9077617f37ebd268b3e806d", "50|doi_dedup___::0b0bcc6825d6c052c37882fd5cfc1e8c", "50|doi_dedup___::4b1541a7cee32527c65ace5d1ed57335", "50|doi_dedup___::1379861df59bd755e4fb39b9f95ffbd3"], "title": "Exploring High-Performance Computing and Biomolecular Simulations", "summary": "This project involves exploring high-performance computing (HPC) and biomolecular simulations using GROMACS. The objectives include understanding molecular dynamics simulations, log files, input files, and output files. Additionally, the project aims to explore competency frameworks for professional development, specifically in the field of computational biomolecular research. The tools and techniques used will include GROMACS, BioExcel Building Blocks, and competency frameworks. The expected outcomes include a deeper understanding of HPC and biomolecular simulations, as well as the development of skills in using GROMACS and BioExcel Building Blocks. The project will also contribute to the development of competency frameworks for professional development in the field of computational biomolecular research.", "startDate": "2023-04-25", "endDate": "2024-01-30"}
{"raid": "-9210544816395499758", "authors": ["Bateson, Melissa", "Andrews, Clare", "Verhulst, Simon", "Nettle, Daniel", "Zuidersma, Erica"], "subjects": ["2. Zero hunger"], "titles": ["Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings", "Data and code archive for Andrews et al. 'Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings'"], "ids": ["50|doi_dedup___::176117239be06189523c253e0ca9c5ec", "50|doi_dedup___::343e0b0ddf0d54763a89a62af1f7a379"], "title": "Investigating the Effects of Food Insecurity on Energy Storage and Somatic Maintenance in European Starlings", "summary": "This study examines the impact of food insecurity on energy storage and somatic maintenance in European starlings. The research involved exposing juvenile starlings to either uninterrupted food availability or a regime of unpredictable food unavailability. The results show that birds exposed to food insecurity stored more energy, but at the expense of somatic maintenance and repair. The study provides insights into the adaptive responses of birds to food scarcity and the trade-offs involved in energy storage and maintenance.", "startDate": "2021-06-28", "endDate": "2021-06-28"}
{"raid": "-9208499171224730388", "authors": ["Maniati, Eleni", "Bakker, Bjorn", "McClelland, Sarah E.", "Shaikh, Nadeem", "De Angelis, Simone", "Johnson, Sarah C.", "Wang, Jun", "Foijer, Floris", "Spierings, Diana C. J.", "Boemo, Michael A.", "Wardenaar, René", "Mazzagatti, Alice"], "subjects": [], "titles": ["Additional file 2 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses", "Additional file 5 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses"], "ids": ["50|doi_dedup___::a1bfeb173971f74a274fab8bdd78a4bc", "50|doi_dedup___::3d6e151aaeb2f7c40a320207fdd80ade"], "title": "Analysis of DNA Copy Number Alterations and Chromosome Scale Losses", "summary": "This study analyzed the effects of replication stress on DNA copy number alterations and chromosome scale losses. The results show distinctive landscapes of these alterations and losses, which were further investigated in additional files. The study provides valuable insights into the mechanisms of replication stress and its impact on genomic stability.", "startDate": "2022-01-01", "endDate": "2022-01-01"}

View File

@ -0,0 +1,232 @@
{
"indexed": {
"date-parts": [
[
2022,
4,
3
]
],
"date-time": "2022-04-03T01:45:59Z",
"timestamp": 1648950359167
},
"reference-count": 0,
"publisher": "American Society of Clinical Oncology (ASCO)",
"issue": "18_suppl",
"content-domain": {
"domain": [],
"crossmark-restriction": false
},
"short-container-title": [
"JCO"
],
"published-print": {
"date-parts": [
[
2007,
6,
20
]
]
},
"abstract": "<jats:p> 3507 </jats:p><jats:p> Purpose: To detect IGF-1R on circulating tumor cells (CTCs) as a biomarker in the clinical development of a monoclonal human antibody, CP-751,871, targeting IGF-1R. Experimental Design: An automated sample preparation and analysis system for enumerating CTCs (Celltracks) was adapted for detecting IGF-1R positive CTCs with a diagnostic antibody targeting a different IGF-1R epitope to CP-751,871. This assay was utilized in three phase I trials of CP-751,871 as a single agent or with chemotherapy and was validated using cell lines and blood samples from healthy volunteers and patients with metastatic carcinoma. Results: There was no interference between the analytical and therapeutic antibodies. CP-751,871 was well tolerated as a single agent, and in combination with docetaxel or carboplatin and paclitaxel, at doses ranging from 0.05 mg/kg to 20 mg/kg. Eighty patients were enrolled on phase 1 studies of CP-751,871, with 47 (59%) patients having CTCs detected during the study. Prior to treatment 26 patients (33%) had CTCs, with 23 having detectable IGF-1R positive CTCs. CP-751,871 alone, and CP-751,871 with cytotoxic chemotherapy, decreased CTCs and IGF-1R positive CTCs; these increased towards the end of the 21-day cycle in some patients, falling again with retreatment. CTCs were commonest in advanced hormone refractory prostate cancer (11/20). Detectable IGF-1R expression on CTCs before treatment with CP-751,871 and docetaxel was associated with a higher frequency of PSA decline by more than 50% (6/10 vs 2/8 patients). A relationship was observed between sustained falls in CTCs counts and PSA declines by more than 50%. Conclusions: IGF-1R expression is detectable by immunofluorescence on CTCs. These data support the further evaluation of CTCs in pharmacodynamic studies and patient selection, particularly in advanced prostate cancer. </jats:p><jats:p> No significant financial relationships to disclose. </jats:p>",
"DOI": "10.1200/jco.2007.25.18_suppl.3507",
"type": "journal-article",
"created": {
"date-parts": [
[
2020,
3,
6
]
],
"date-time": "2020-03-06T20:50:42Z",
"timestamp": 1583527842000
},
"page": "3507-3507",
"source": "Crossref",
"is-referenced-by-count": 0,
"title": [
"Circulating tumor cells expressing the insulin growth factor-1 receptor (IGF-1R): Method of detection, incidence and potential applications"
],
"prefix": "10.1200",
"volume": "25",
"author": [
{
"given": "J. S.",
"family": "de Bono",
"sequence": "first",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "A.",
"family": "Adjei",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "G.",
"family": "Attard",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "M.",
"family": "Pollak",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "P.",
"family": "Fong",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "P.",
"family": "Haluska",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "L.",
"family": "Roberts",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "D.",
"family": "Chainese",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "L.",
"family": "Terstappen",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
},
{
"given": "A.",
"family": "Gualberto",
"sequence": "additional",
"affiliation": [
{
"name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA"
}
]
}
],
"member": "233",
"container-title": [
"Journal of Clinical Oncology"
],
"original-title": [],
"language": "en",
"deposited": {
"date-parts": [
[
2020,
3,
6
]
],
"date-time": "2020-03-06T20:51:03Z",
"timestamp": 1583527863000
},
"score": 1,
"resource": {
"primary": {
"URL": "http://ascopubs.org/doi/10.1200/jco.2007.25.18_suppl.3507"
}
},
"subtitle": [],
"short-title": [],
"issued": {
"date-parts": [
[
2007,
6,
20
]
]
},
"references-count": 0,
"journal-issue": {
"issue": "18_suppl",
"published-print": {
"date-parts": [
[
2007,
6,
20
]
]
}
},
"alternative-id": [
"10.1200/jco.2007.25.18_suppl.3507"
],
"URL": "http://dx.doi.org/10.1200/jco.2007.25.18_suppl.3507",
"relation": {},
"ISSN": [
"0732-183X",
"1527-7755"
],
"issn-type": [
{
"value": "0732-183X",
"type": "print"
},
{
"value": "1527-7755",
"type": "electronic"
}
],
"subject": [],
"published": {
"date-parts": [
[
2007,
6,
20
]
]
}
}

View File

@ -3,12 +3,15 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.api.{Assertions, BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters.asScalaBufferConverter
@ExtendWith(Array(classOf[MockitoExtension])) @ExtendWith(Array(classOf[MockitoExtension]))
class CrossrefMappingTest extends AbstractVocabularyTest { class CrossrefMappingTest extends AbstractVocabularyTest {
@ -25,8 +28,32 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
val input = val input =
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All)) Crossref2Oaf
.convert(input, vocabularies, TransformationType.All)
.foreach(record => {
Assertions.assertNotNull(record)
})
} }
@Test
def mappingAffiliation(): Unit = {
val input =
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/affiliationTest.json"),
"utf-8"
)
val data = Crossref2Oaf.convert(input, vocabularies, TransformationType.OnlyResult)
data.foreach(record => {
Assertions.assertNotNull(record)
Assertions.assertTrue(record.isInstanceOf[Publication])
val publication = record.asInstanceOf[Publication]
publication.getAuthor.asScala.foreach(author => {
Assertions.assertNotNull(author.getRawAffiliationString)
Assertions.assertTrue(author.getRawAffiliationString.size() > 0)
})
})
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(data.head))
}
} }

View File

@ -171,7 +171,7 @@ public class Utils implements Serializable {
public static List<String> getCommunityIdList(String baseURL) throws IOException { public static List<String> getCommunityIdList(String baseURL) throws IOException {
return getValidCommunities(baseURL) return getValidCommunities(baseURL)
.stream() .stream()
.map(community -> community.getId()) .map(CommunityModel::getId)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -1,11 +1,14 @@
package eu.dnetlib.dhp.resulttocommunityfromsemrel; package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import static java.lang.String.join;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -19,6 +22,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.api.Utils; import eu.dnetlib.dhp.api.Utils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@ -45,7 +49,7 @@ public class PrepareResultCommunitySetStep1 {
/** /**
* a dataset for example could be linked to more than one publication. For each publication linked to that dataset * a dataset for example could be linked to more than one publication. For each publication linked to that dataset
* the previous query will produce a row: targetId set of community context the target could possibly inherit with * the previous query will produce a row: targetId, set of community context the target could possibly inherit. With
* the following query there will be a single row for each result linked to more than one result of the result type * the following query there will be a single row for each result linked to more than one result of the result type
* currently being used * currently being used
*/ */
@ -56,6 +60,27 @@ public class PrepareResultCommunitySetStep1 {
+ "where length(co) > 0 " + "where length(co) > 0 "
+ "group by resultId"; + "group by resultId";
private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO = "select target as resultId, community_context "
+
"from resultWithContext rwc " +
"join relatedToRelations r " +
"join patents p " +
"on rwc.id = r.source and r.target = p.id";
private static final String RESULT_WITH_CONTEXT = "select id, collect_set(co.id) community_context \n" +
" from result " +
" lateral view explode (context) c as co " +
" where lower(co.id) IN %s" +
" group by id";
private static final String RESULT_PATENT = "select id " +
" from result " +
" where array_contains(instance.instancetype.classname, 'Patent')";
private static final String IS_RELATED_TO_RELATIONS = "select source, target " +
" from relation " +
" where lower(relClass) = 'isrelatedto' and datainfo.deletedbyinference = false";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
@ -82,14 +107,25 @@ public class PrepareResultCommunitySetStep1 {
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); final String allowedsemrel = "(" + join(
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); ",",
Arrays
.asList(parser.get("allowedsemrels").split(";"))
.stream()
.map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new))
+ ")";
log.info("allowedSemRel: {}", allowedsemrel);
final String baseURL = parser.get("baseURL"); final String baseURL = parser.get("baseURL");
log.info("baseURL: {}", baseURL); log.info("baseURL: {}", baseURL);
final List<String> communityIdList = getCommunityList(baseURL); final String communityIdList = "(" + join(
log.info("communityIdList: {}", new Gson().toJson(communityIdList)); ",", getCommunityList(baseURL)
.stream()
.map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new))
+ ")";
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType); log.info("resultType: {}", resultType);
@ -118,10 +154,10 @@ public class PrepareResultCommunitySetStep1 {
SparkSession spark, SparkSession spark,
String inputPath, String inputPath,
String outputPath, String outputPath,
List<String> allowedsemrel, String allowedsemrel,
Class<R> resultClazz, Class<R> resultClazz,
String resultType, String resultType,
List<String> communityIdList) { String communityIdList) {
final String inputResultPath = inputPath + "/" + resultType; final String inputResultPath = inputPath + "/" + resultType;
log.info("Reading Graph table from: {}", inputResultPath); log.info("Reading Graph table from: {}", inputResultPath);
@ -132,7 +168,8 @@ public class PrepareResultCommunitySetStep1 {
Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class); Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class);
relation.createOrReplaceTempView("relation"); relation.createOrReplaceTempView("relation");
Dataset<R> result = readPath(spark, inputResultPath, resultClazz); Dataset<R> result = readPath(spark, inputResultPath, resultClazz)
.where("datainfo.deletedbyinference != true AND datainfo.invisible != true");
result.createOrReplaceTempView("result"); result.createOrReplaceTempView("result");
final String outputResultPath = outputPath + "/" + resultType; final String outputResultPath = outputPath + "/" + resultType;
@ -141,10 +178,20 @@ public class PrepareResultCommunitySetStep1 {
String resultContextQuery = String String resultContextQuery = String
.format( .format(
RESULT_CONTEXT_QUERY_TEMPLATE, RESULT_CONTEXT_QUERY_TEMPLATE,
getConstraintList(" lower(co.id) = '", communityIdList), "AND lower(co.id) IN " + communityIdList,
getConstraintList(" lower(relClass) = '", allowedsemrel)); "AND lower(relClass) IN " + allowedsemrel);
Dataset<Row> result_context = spark.sql(resultContextQuery); Dataset<Row> result_context = spark.sql(resultContextQuery);
Dataset<Row> rwc = spark.sql(String.format(RESULT_WITH_CONTEXT, communityIdList));
Dataset<Row> patents = spark.sql(RESULT_PATENT);
Dataset<Row> relatedToRelations = spark.sql(IS_RELATED_TO_RELATIONS);
rwc.createOrReplaceTempView("resultWithContext");
patents.createOrReplaceTempView("patents");
relatedToRelations.createOrReplaceTempView("relatedTorelations");
result_context = result_context.unionAll(spark.sql(RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO));
result_context.createOrReplaceTempView("result_context"); result_context.createOrReplaceTempView("result_context");
spark spark
@ -152,8 +199,9 @@ public class PrepareResultCommunitySetStep1 {
.as(Encoders.bean(ResultCommunityList.class)) .as(Encoders.bean(ResultCommunityList.class))
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.mode(SaveMode.Overwrite) .mode(SaveMode.Append)
.json(outputResultPath); .json(outputResultPath);
} }
public static List<String> getCommunityList(final String baseURL) throws IOException { public static List<String> getCommunityList(final String baseURL) throws IOException {

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
@ -76,22 +77,13 @@ public class PrepareResultCommunitySetStep2 {
if (b == null) { if (b == null) {
return a; return a;
} }
Set<String> community_set = new HashSet<>(); Set<String> community_set = new HashSet<>(a.getCommunityList());
a.getCommunityList().stream().forEach(aa -> community_set.add(aa)); community_set.addAll(b.getCommunityList());
b a.setCommunityList(new ArrayList<>(community_set));
.getCommunityList()
.stream()
.forEach(
aa -> {
if (!community_set.contains(aa)) {
a.getCommunityList().add(aa);
community_set.add(aa);
}
});
return a; return a;
}) })
.map(Tuple2::_2) .map(Tuple2::_2)
.map(r -> OBJECT_MAPPER.writeValueAsString(r)) .map(OBJECT_MAPPER::writeValueAsString)
.saveAsTextFile(outputPath, GzipCodec.class); .saveAsTextFile(outputPath, GzipCodec.class);
} }

View File

@ -6,7 +6,9 @@ import static org.apache.spark.sql.functions.desc;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -24,7 +26,9 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import scala.collection.Seq;
public class ResultToCommunityJobTest { public class ResultToCommunityJobTest {
@ -271,4 +275,59 @@ public class ResultToCommunityJobTest {
.get(0) .get(0)
.getString(0)); .getString(0));
} }
@Test
public void prepareStep1Test() throws Exception {
/*
* final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";")) .map(value ->
* "'" + value.toLowerCase() + "'") .toArray(String[]::new)); log.info("allowedSemRel: {}", new
* Gson().toJson(allowedsemrel)); final String baseURL = parser.get("baseURL"); log.info("baseURL: {}",
* baseURL);
*/
PrepareResultCommunitySetStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph")
.getPath(),
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "issupplementto;issupplementedby",
"-baseURL", "https://dev-openaire.d4science.org/openaire/community/"
});
org.apache.spark.sql.Dataset<ResultCommunityList> resultCommunityList = spark
.read()
.schema(Encoders.bean(ResultCommunityList.class).schema())
.json(workingDir.toString() + "/preparedInfo/publication")
.as(Encoders.bean(ResultCommunityList.class));
Assertions.assertEquals(2, resultCommunityList.count());
Assertions
.assertEquals(
1,
resultCommunityList.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'").count());
Assertions
.assertEquals(
1,
resultCommunityList.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'").count());
ArrayList<String> communities = resultCommunityList
.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'")
.first()
.getCommunityList();
Assertions.assertEquals(2, communities.size());
Assertions.assertTrue(communities.stream().anyMatch(cid -> "beopen".equals(cid)));
Assertions.assertTrue(communities.stream().anyMatch(cid -> "dh-ch".equals(cid)));
communities = resultCommunityList
.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'")
.first()
.getCommunityList();
Assertions.assertEquals(1, communities.size());
Assertions.assertEquals("dh-ch", communities.get(0));
}
} }

View File

@ -0,0 +1,24 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|pending_org_::82f63b2d21ae88596b9d8991780e9888","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","subRelType":"affiliation","target":"20|openorgs____::322ff2a6524820640bc5d1311871585e","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","subRelType":"affiliation","target":"20|openorgs____::58e60f1715d219aa6757ba0b0f2ccbce","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","source":"10|issn___print::a7a2010e75d849442790955162ef4e42","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e43","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e44","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e45","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|355e65625b88::74009c567c81b4aa55c813db658734df","subRelType":"affiliation","target":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","subRelType":"affiliation","target":"20|openorgs____::91a81877815afb4ebf25c1a3f3b03c5d","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","subRelType":"affiliation","target":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false}