[EOSC Dump] added support for extending the results with affiliation information

Miriam Baglioni 2022-09-21 17:24:37 +02:00
parent ae10ae9793
commit 31fa465f6a
12 changed files with 538 additions and 96 deletions
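
For context: with this change, each dumped EOSC record carries an affiliation list built from the graph's affiliation relations. A sketch of the fragment the extension adds to a record, with field names taken from the Organization and OrganizationPid model classes in this commit and values borrowed from its test fixtures (illustrative only):

    "affiliation": [
      {
        "id": "20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff",
        "name": "Doris Engineering (France)",
        "pid": [
          { "type": "GRID", "value": "grid.432986.2" },
          { "type": "ROR", "value": "https://ror.org/03nd0ms94" }
        ]
      },
      {
        "id": "20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017",
        "name": "RENNES METROPOLE",
        "pid": [
          { "type": "PIC", "value": "892062829" }
        ]
      }
    ]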

View File

@@ -49,4 +49,12 @@ public class EoscResult extends CommunityResult {
    public void setSubject(Map<String, List<Subject>> subject) {
        this.subject = subject;
    }

    public List<Organization> getAffiliation() {
        return affiliation;
    }

    public void setAffiliation(List<Organization> affiliation) {
        this.affiliation = affiliation;
    }
}

View File

@@ -0,0 +1,148 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.eosc.model.EoscResult;
import eu.dnetlib.dhp.eosc.model.OrganizationPid;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 27/07/22
 */
public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(ExtendEoscResultWithOrganizationStep2.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                ExtendEoscResultWithOrganizationStep2.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/eosc_extend_result_with_organization_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String resultPath = parser.get("resultPath");
        log.info("resultPath: {}", resultPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath);
                addOrganizations(spark, inputPath, outputPath, resultPath, inputClazz);
            });
    }

    private static <R extends Result> void addOrganizations(SparkSession spark, String inputPath, String outputPath,
        String resultPath, Class<R> inputClazz) {
        // the EOSC results dumped in the previous step, to be extended with the affiliation information
        Dataset<EoscResult> results = Utils
            .readPath(spark, resultPath, EoscResult.class);

        // affiliation relations that are neither deleted by inference nor invisible
        Dataset<Relation> relations = Utils
            .readPath(spark, inputPath + "/relation", Relation.class)
            .filter(
                (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
                    !r.getDataInfo().getInvisible() && r.getSubRelType().equalsIgnoreCase(ModelConstants.AFFILIATION));

        Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "/organization", Organization.class);

        // map each relation to the pair (result id, dumped organization), deduplicating the organization pids
        Dataset<ResultOrganizations> resultOrganization = relations
            .joinWith(organizations, relations.col("source").equalTo(organizations.col("id")), "left")
            .map((MapFunction<Tuple2<Relation, Organization>, ResultOrganizations>) t2 -> {
                if (t2._2() != null) {
                    ResultOrganizations rOrg = new ResultOrganizations();
                    rOrg.setResultId(t2._1().getTarget());
                    eu.dnetlib.dhp.eosc.model.Organization org = new eu.dnetlib.dhp.eosc.model.Organization();
                    org.setId(t2._2().getId());
                    org.setName(t2._2().getLegalname().getValue());
                    // group the pid values by type so that replicated pids appear only once in the dump
                    HashMap<String, Set<String>> organizationPids = new HashMap<>();
                    t2._2().getPid().forEach(p -> {
                        if (!organizationPids.containsKey(p.getQualifier().getClassid()))
                            organizationPids.put(p.getQualifier().getClassid(), new HashSet<>());
                        organizationPids.get(p.getQualifier().getClassid()).add(p.getValue());
                    });
                    List<OrganizationPid> pids = new ArrayList<>();
                    for (String key : organizationPids.keySet()) {
                        for (String value : organizationPids.get(key)) {
                            OrganizationPid pid = new OrganizationPid();
                            pid.setValue(value);
                            pid.setType(key);
                            pids.add(pid);
                        }
                    }
                    org.setPid(pids);
                    rOrg.setAffiliation(org);
                    return rOrg;
                }
                return null;
            }, Encoders.bean(ResultOrganizations.class))
            // the explicit cast avoids the ambiguity between filter(FilterFunction) and the Scala overload
            .filter((FilterFunction<ResultOrganizations>) Objects::nonNull);

        // attach to each result the list of its affiliations, if any, and write the extended dump
        results
            .joinWith(resultOrganization, results.col("id").equalTo(resultOrganization.col("resultId")), "left")
            .groupByKey(
                (MapFunction<Tuple2<EoscResult, ResultOrganizations>, String>) t2 -> t2._1().getId(),
                Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Tuple2<EoscResult, ResultOrganizations>, EoscResult>) (s, it) -> {
                    Tuple2<EoscResult, ResultOrganizations> first = it.next();
                    if (first._2() == null) {
                        return first._1();
                    }
                    EoscResult ret = first._1();
                    List<eu.dnetlib.dhp.eosc.model.Organization> affiliation = new ArrayList<>();
                    affiliation.add(first._2().getAffiliation());
                    it.forEachRemaining(res -> affiliation.add(res._2().getAffiliation()));
                    ret.setAffiliation(affiliation);
                    return ret;
                }, Encoders.bean(EoscResult.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }
}

View File

@@ -0,0 +1,31 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;

import java.io.Serializable;
import java.util.List;

import eu.dnetlib.dhp.eosc.model.Organization;

/**
 * @author miriam.baglioni
 * @Date 20/09/22
 */
public class ResultOrganizations implements Serializable {
    private String resultId;
    private Organization affiliation;

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public Organization getAffiliation() {
        return affiliation;
    }

    public void setAffiliation(Organization affiliation) {
        this.affiliation = affiliation;
    }
}

View File

@@ -1,83 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.schema.oaf.Relation;

/**
 * @author miriam.baglioni
 * @Date 27/07/22
 */
public class SelectEoscRelationsStep2 implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SelectEoscRelationsStep2.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SelectEoscRelationsStep2.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/reletion_selection_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String resultPath = parser.get("resultPath");
        log.info("resultPath: {}", resultPath);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, resultPath + "/relation");
                selectRelations(spark, inputPath, resultPath + "/relation", resultPath);
            });
    }

    private static void selectRelations(SparkSession spark, String inputPath, String outputPath, String resultPath) {
        Dataset<GraphResult> results = Utils
            .readPath(spark, resultPath + "/publication", GraphResult.class)
            .union(
                Utils
                    .readPath(spark, resultPath + "/dataset", GraphResult.class))
            .union(
                Utils
                    .readPath(spark, resultPath + "/software", GraphResult.class))
            .union(
                Utils
                    .readPath(spark, resultPath + "/otherresearchproduct", GraphResult.class));

        Dataset<Relation> relations = Utils
            .readPath(spark, inputPath + "/relation", Relation.class)
            .filter(
                (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
                    !r.getDataInfo().getInvisible());
    }
}

View File

@@ -80,14 +80,15 @@ public class SelectEoscResultsJobStep1 implements Serializable {
            .readPath(spark, inputPath, inputClazz)
            .filter(
                (FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible()
                    && (r.getContext().stream().anyMatch(c -> c.getId().equals("eosc")) ||
                        Optional
                            .ofNullable(r.getSubject())
                            .map(
                                s -> s
                                    .stream()
                                    .anyMatch(sbj -> sbj.getValue().equalsIgnoreCase("EOSC::RO-crate")))
                            .orElse(false)))
                    && r.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
//                  ||
//                  Optional
//                      .ofNullable(r.getSubject())
//                      .map(
//                          s -> s
//                              .stream()
//                              .anyMatch(sbj -> sbj.getValue().equalsIgnoreCase("EOSC::RO-crate")))
//                      .orElse(false)))
            .map(
                (MapFunction<R, EoscResult>) r -> (EoscResult) ResultMapper
                    .map(r, communityMap, Constants.DUMPTYPE.EOSC.getType()),

View File

@@ -0,0 +1,36 @@
[
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  },
  {
    "paramName": "tn",
    "paramLongName": "resultTableName",
    "paramDescription": "the name of the result table we are currently working on",
    "paramRequired": true
  },
  {
    "paramName": "rp",
    "paramLongName": "resultPath",
    "paramDescription": "the path of the dumped results to be extended with the affiliation information",
    "paramRequired": true
  }
]

View File

@@ -132,6 +132,31 @@
                <arg>--outputPath</arg><arg>${workingDir}/dump/publication</arg>
                <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
            </spark>
            <ok to="extend_eosc_publication"/>
            <error to="Kill"/>
        </action>

        <action name="extend_eosc_publication">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn</master>
                <mode>cluster</mode>
                <name>Extend Dump Publication For EOSC with affiliations</name>
                <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendEoscResultWithOrganizationStep2</class>
                <jar>dump-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.extraListeners=${spark2ExtraListeners}
                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
                <arg>--resultPath</arg><arg>${workingDir}/dump/publication</arg>
                <arg>--outputPath</arg><arg>${workingDir}/dump/publicationextended</arg>
            </spark>
            <ok to="wait_eosc_dump"/>
            <error to="Kill"/>
        </action>
@@ -157,6 +182,31 @@
                <arg>--outputPath</arg><arg>${workingDir}/dump/dataset</arg>
                <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
            </spark>
            <ok to="extend_eosc_dataset"/>
            <error to="Kill"/>
        </action>

        <action name="extend_eosc_dataset">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn</master>
                <mode>cluster</mode>
                <name>Extend Dump Dataset For EOSC with affiliations</name>
                <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendEoscResultWithOrganizationStep2</class>
                <jar>dump-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.extraListeners=${spark2ExtraListeners}
                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
                <arg>--resultPath</arg><arg>${workingDir}/dump/dataset</arg>
                <arg>--outputPath</arg><arg>${workingDir}/dump/datasetextended</arg>
            </spark>
            <ok to="wait_eosc_dump"/>
            <error to="Kill"/>
        </action>
@@ -182,6 +232,31 @@
                <arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
                <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
            </spark>
            <ok to="extend_eosc_orp"/>
            <error to="Kill"/>
        </action>

        <action name="extend_eosc_orp">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn</master>
                <mode>cluster</mode>
                <name>Extend Dump Otherresearchproduct For EOSC with affiliations</name>
                <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendEoscResultWithOrganizationStep2</class>
                <jar>dump-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.extraListeners=${spark2ExtraListeners}
                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--resultPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
                <arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproductextended</arg>
            </spark>
            <ok to="wait_eosc_dump"/>
            <error to="Kill"/>
        </action>
@@ -207,10 +282,34 @@
                <arg>--outputPath</arg><arg>${workingDir}/dump/software</arg>
                <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
            </spark>
            <ok to="extend_eosc_software"/>
            <error to="Kill"/>
        </action>

        <action name="extend_eosc_software">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn</master>
                <mode>cluster</mode>
                <name>Extend Dump Software For EOSC with affiliations</name>
                <class>eu.dnetlib.dhp.oa.graph.dump.eosc.ExtendEoscResultWithOrganizationStep2</class>
                <jar>dump-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.extraListeners=${spark2ExtraListeners}
                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--resultPath</arg><arg>${workingDir}/dump/software</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
                <arg>--outputPath</arg><arg>${workingDir}/dump/softwareextended</arg>
            </spark>
            <ok to="wait_eosc_dump"/>
            <error to="Kill"/>
        </action>

        <join name="wait_eosc_dump" to="prepareResultProject"/>

        <action name="prepareResultProject">
@@ -261,7 +360,7 @@
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/publication</arg>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/publicationextended</arg>
                <arg>--outputPath</arg><arg>${workingDir}/tar/publication</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
                <arg>--dumpType</arg><arg>eosc</arg>
@@ -287,7 +386,7 @@
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/dataset</arg>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/datasetextended</arg>
                <arg>--outputPath</arg><arg>${workingDir}/tar/dataset</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
                <arg>--dumpType</arg><arg>eosc</arg>
@@ -313,7 +412,7 @@
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/otherresearchproductextended</arg>
                <arg>--outputPath</arg><arg>${workingDir}/tar/otherresearchproduct</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
                <arg>--dumpType</arg><arg>eosc</arg>
@@ -339,7 +438,7 @@
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/software</arg>
                <arg>--sourcePath</arg><arg>${workingDir}/dump/softwareextended</arg>
                <arg>--outputPath</arg><arg>${workingDir}/tar/software</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
                <arg>--dumpType</arg><arg>eosc</arg>

View File

@@ -0,0 +1,175 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.eosc.model.EoscResult;
import eu.dnetlib.dhp.eosc.model.Organization;
import eu.dnetlib.dhp.oa.graph.dump.complete.SelectRelationTest;
import eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob;
import eu.dnetlib.dhp.schema.oaf.Relation;

/**
 * @author miriam.baglioni
 * @Date 21/09/22
 */
public class SelectEoscResultTest {
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static SparkSession spark;

    private static Path workingDir;

    private static final Logger log = LoggerFactory
        .getLogger(SelectEoscResultTest.class);

    private static HashMap<String, String> map = new HashMap<>();

    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files
            .createTempDirectory(SelectEoscResultTest.class.getSimpleName());
        log.info("using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(SelectRelationTest.class.getSimpleName());
        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark = SparkSession
            .builder()
            .appName(SelectRelationTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    @AfterAll
    public static void afterAll() throws IOException {
        FileUtils.deleteDirectory(workingDir.toFile());
        spark.stop();
    }

    @Test
    public void selectEoscResults() throws Exception {
        final String sourcePath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/input/publication")
            .getPath();
        final String cmp = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
            .getPath();

        SelectEoscResultsJobStep1.main(new String[] {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-outputPath", workingDir.toString() + "/publication",
            "-sourcePath", sourcePath,
            "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
            "-communityMapPath", cmp
        });

        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        JavaRDD<EoscResult> tmp = sc
            .textFile(workingDir.toString() + "/publication")
            .map(item -> OBJECT_MAPPER.readValue(item, EoscResult.class));

        Assertions.assertEquals(3, tmp.count());
        Assertions.assertEquals(0, tmp.filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0).count());
    }
// "source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"
// legalname = Doris Engineering (France)
// pid = [(pidtype:"GRID","value":"grid.432986.2"),("classid":"ROR","value":"https://ror.org/03nd0ms94"),("classid":"GRID","value":"grid.432986.2"),("classid":"ROR","value":"https://ror.org/03nd0ms94")]
// sono replicate e ci dovrebbero essere uniche nel dump
// "source":"20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"
// legalname = RENNES METROPOLE
//pid = [(PIC, 892062829)]
// "source":"20|____________::d1b0ee22411434cf905692d0fac25749","subRelType":"affiliation","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98"
//legalname = MIKARE RESEARCH
//pid = []
//for 50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f no affiliation relation is provided
    @Test
    public void ExtendEoscResultWithOrganizationTest() throws Exception {
        final String sourcePath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/input")
            .getPath();
        final String cmp = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
            .getPath();
        String resultPath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/publication")
            .getPath();

        ExtendEoscResultWithOrganizationStep2.main(new String[] {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-outputPath", workingDir.toString() + "/publication",
            "-sourcePath", sourcePath,
            "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
            "-resultPath", resultPath
        });

        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        JavaRDD<EoscResult> tmp = sc
            .textFile(workingDir.toString() + "/publication")
            .map(item -> OBJECT_MAPPER.readValue(item, EoscResult.class));

        Assertions.assertEquals(3, tmp.count());
        Assertions.assertEquals(2, tmp.filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0).count());

        Assertions.assertEquals(2, tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")).first().getAffiliation().size());

        List<Organization> affiliations = tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")).first().getAffiliation();
        Assertions.assertTrue(affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("Doris Engineering (France)")));
        Assertions.assertTrue(affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("RENNES METROPOLE")));

        Organization organization = affiliations.stream().filter(a -> a.getId().equalsIgnoreCase("20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff")).findFirst().get();
        Assertions.assertEquals("Doris Engineering (France)", organization.getName());
        Assertions.assertTrue(organization.getPid().stream().anyMatch(p -> p.getValue().equalsIgnoreCase("grid.432986.2") && p.getType().equalsIgnoreCase("grid")));
        Assertions.assertTrue(organization.getPid().stream().anyMatch(p -> p.getValue().equalsIgnoreCase("https://ror.org/03nd0ms94") && p.getType().equalsIgnoreCase("ror")));
        Assertions.assertEquals(2, organization.getPid().size());

        organization = affiliations.stream().filter(a -> a.getId().equalsIgnoreCase("20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017")).findFirst().get();
        Assertions.assertEquals("RENNES METROPOLE", organization.getName());
        Assertions.assertEquals(1, organization.getPid().size());
        Assertions.assertTrue(organization.getPid().get(0).getValue().equalsIgnoreCase("892062829") && organization.getPid().get(0).getType().equalsIgnoreCase("pic"));

        Assertions.assertEquals(1, tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")).first().getAffiliation().size());
        Assertions.assertEquals("MIKARE RESEARCH", tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")).first().getAffiliation().get(0).getName());
        Assertions.assertEquals(0, tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")).first().getAffiliation().get(0).getPid().size());

        Assertions.assertFalse(Optional.ofNullable(tmp.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f")).first().getAffiliation()).isPresent());
    }
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,12 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466717543,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466715268,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|illesrfpubli::f7a12dbc69edc80dad0d03cd70194a50","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466740559,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|od______1575::c301a67861dc4f123e9db3e707292f51","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466732165,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|pmid________::adb1767dee3e12a15e63d5d714878450","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466716451,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|668c0d5724b9::4bfb57b2efb5028c8d4261e11efe6a77","subRelType":"affiliation","target":"50|od_______624::eaa14ce67036c248fedf17aecf721a62","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1658466726873,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|6800d980759b::148dba7689611d62d933e66adc2e9c03","subRelType":"affiliation","target":"50|pmid________::b14f9e560ff8325dca48da40eb5598a6","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8998"},"lastupdatetimestamp":1658466730052,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8886"},"lastupdatetimestamp":1658466725950,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|VTTRsInSsCrs::735915884eb439d42953372eaf934782","subRelType":"affiliation","target":"50|pmc_________::76dce1630801318b69ff81f1703a4fc9","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8847"},"lastupdatetimestamp":1658466741040,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|____________::d1b0ee22411434cf905692d0fac25749","subRelType":"affiliation","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8847"},"lastupdatetimestamp":1658466737372,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|____________::d1b0ee22411434cf905692d0fac25749","subRelType":"affiliation","target":"50|pmid________::3a5bb2b50c18e755cbe67b9ca7d821ee","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8998"},"lastupdatetimestamp":1658466717565,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|aka_________::04ab269cfcf6bd571b6285151ec554b5","subRelType":"affiliation","target":"50|nora_uio__no::01152f3e683765695bbad68fc692b85e","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8998"},"lastupdatetimestamp":1658466733174,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|aka_________::0838366fa1df3c1599ddefc2168ada5d","subRelType":"affiliation","target":"50|arXiv_______::abe2b16af6067994dda4beab6410b35d","validated":false}

View File

@@ -0,0 +1,3 @@
{"author":[{"fullname":"Levande, Paul","name":"Paul","surname":"Levande","rank":1,"pid":null}],"type":"publication","language":{"code":"eng","label":"English"},"country":[],"maintitle":"Special Cases of the Parking Functions Conjecture and Upper-Triangular Matrices","subtitle":null,"description":["We examine the $q=1$ and $t=0$ special cases of the parking functions conjecture. The parking functions conjecture states that the Hilbert series for the space of diagonal harmonics is equal to the bivariate generating function of $area$ and $dinv$ over the set of parking functions. Haglund recently proved that the Hilbert series for the space of diagonal harmonics is equal to a bivariate generating function over the set of Tesler matricesupper-triangular matrices with every hook sum equal to one. We give a combinatorial interpretation of the Haglund generating function at $q=1$ and prove the corresponding case of the parking functions conjecture (first proven by Garsia and Haiman). We also discuss a possible proof of the $t = 0$ case consistent with this combinatorial interpretation. We conclude by briefly discussing possible refinements of the parking functions conjecture arising from this research and point of view. $\\textbf{Note added in proof}$: We have since found such a proof of the $t = 0$ case and conjectured more detailed refinements. This research will most likely be presented in full in a forthcoming article.","On examine les cas spéciaux $q=1$ et $t=0$ de la conjecture des fonctions de stationnement. Cette conjecture déclare que la série de Hilbert pour l'espace des harmoniques diagonaux est égale à la fonction génératrice bivariée (paramètres $area$ et $dinv$) sur l'ensemble des fonctions de stationnement. Haglund a prouvé récemment que la série de Hilbert pour l'espace des harmoniques diagonaux est égale à une fonction génératrice bivariée sur l'ensemble des matrices de Tesler triangulaires supérieures dont la somme de chaque équerre vaut un. On donne une interprétation combinatoire de la fonction génératrice de Haglund pour $q=1$ et on prouve le cas correspondant de la conjecture dans le cas des fonctions de stationnement (prouvé d'abord par Garsia et Haiman). On discute aussi d'une preuve possible du cas $t=0$, cohérente avec cette interprétation combinatoire. On conclut en discutant brièvement les raffinements possibles de la conjecture des fonctions de stationnement de ce point de vue. $\\textbf{Note ajoutée sur épreuve}$: j'ai trouvé depuis cet article une preuve du cas $t=0$ et conjecturé des raffinements possibles. Ces résultats seront probablement présentés dans un article ultérieur."],"publicationdate":"2011-01-01","publisher":null,"embargoenddate":null,"source":["ISSN: 1365-8050","Discrete Mathematics & Theoretical Computer Science","Episciences.org","dmtcs:2940 - Discrete Mathematics & Theoretical Computer Science, 2011-01-01, DMTCS Proceedings vol. 
AO, 23rd International Conference on Formal Power Series and Algebraic Combinatorics (FPSAC 2011)"],"format":[],"contributor":["Coordination Episciences iam"],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/"},"container":null,"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","originalId":["oai:episciences.org:dmtcs:2940"],"pid":[],"dateofcollection":"2022-04-12T19:57:46.9Z","lastupdatetimestamp":1663599091226,"projects":null,"context":null,"instance":[{"measures":null,"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.46298/dmtcs.2940"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":null},"type":"Article","url":["https://dmtcs.episciences.org/2940"],"articleprocessingcharge":null,"publicationdate":"2011-01-01","refereed":"UNKNOWN","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}}],"eoscIF":null,"subject":{},"keywords":["parking function","Hilbert series","diagonal harmonics","[MATH.MATH-CO] Mathematics [math]/Combinatorics [math.CO]","[INFO.INFO-DM] Computer Science [cs]/Discrete Mathematics [cs.DM]"],"affiliation":null}
{"author":[{"fullname":"Blondin, Michael","name":"Michael","surname":"Blondin","rank":1,"pid":null},{"fullname":"Raskin, Mikhail","name":"Mikhail","surname":"Raskin","rank":2,"pid":null}],"type":"publication","language":{"code":"und","label":"Undetermined"},"country":[],"maintitle":"The Complexity of Reachability in Affine Vector Addition Systems with States","subtitle":null,"description":["Vector addition systems with states (VASS) are widely used for the formalverification of concurrent systems. Given their tremendous computationalcomplexity, practical approaches have relied on techniques such as reachabilityrelaxations, e.g., allowing for negative intermediate counter values. It isnatural to question their feasibility for VASS enriched with primitives thattypically translate into undecidability. Spurred by this concern, we pinpointthe complexity of integer relaxations with respect to arbitrary classes ofaffine operations. More specifically, we provide a trichotomy on the complexity of integerreachability in VASS extended with affine operations (affine VASS). Namely, weshow that it is NP-complete for VASS with resets, PSPACE-complete for VASS with(pseudo-)transfers and VASS with (pseudo-)copies, and undecidable for any otherclass. We further present a dichotomy for standard reachability in affine VASS:it is decidable for VASS with permutations, and undecidable for any otherclass. This yields a complete and unified complexity landscape of reachabilityin affine VASS. We also consider the reachability problem parameterized by afixed affine VASS, rather than a class, and we show that the complexitylandscape is arbitrary in this setting."],"publicationdate":"2021-07-20","publisher":null,"embargoenddate":null,"source":["ISSN: 1860-5974","Logical Methods in Computer Science","Episciences.org","lmcs:6872 - Logical Methods in Computer Science, 2021-07-20, Volume 17, Issue 3"],"format":[],"contributor":["Michael Blondin"],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/"},"container":null,"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","originalId":["oai:episciences.org:lmcs:7687"],"pid":[],"dateofcollection":"2022-04-12T19:57:21.4Z","lastupdatetimestamp":1663599096765,"projects":null,"context":null,"instance":[{"measures":null,"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.46298/lmcs-17(3:3)2021"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":null},"type":"Article","url":["https://lmcs.episciences.org/7687"],"articleprocessingcharge":null,"publicationdate":"2021-07-20","refereed":"UNKNOWN","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}}],"eoscIF":null,"subject":{},"keywords":["Computer Science - Logic in Computer Science","Computer Science - Computational Complexity","Computer Science - Formal Languages and Automata Theory"],"affiliation":null}
{"author":[{"fullname":"Ward, Mark Daniel","name":"Mark Daniel","surname":"Ward","rank":1,"pid":null},{"fullname":"Szpankowski, Wojciech","name":"Wojciech","surname":"Szpankowski","rank":2,"pid":null}],"type":"publication","language":{"code":"eng","label":"English"},"country":[],"maintitle":"Analysis of the multiplicity matching parameter in suffix trees","subtitle":null,"description":["In a suffix tree, the multiplicity matching parameter (MMP) $M_n$ is the number of leaves in the subtree rooted at the branching point of the $(n+1)$st insertion. Equivalently, the MMP is the number of pointers into the database in the Lempel-Ziv '77 data compression algorithm. We prove that the MMP asymptotically follows the logarithmic series distribution plus some fluctuations. In the proof we compare the distribution of the MMP in suffix trees to its distribution in tries built over independent strings. Our results are derived by both probabilistic and analytic techniques of the analysis of algorithms. In particular, we utilize combinatorics on words, bivariate generating functions, pattern matching, recurrence relations, analytical poissonization and depoissonization, the Mellin transform, and complex analysis."],"publicationdate":"2005-01-01","publisher":null,"embargoenddate":null,"source":["ISSN: 1365-8050","Discrete Mathematics & Theoretical Computer Science","Episciences.org","dmtcs:3387 - Discrete Mathematics & Theoretical Computer Science, 2005-01-01, DMTCS Proceedings vol. AD, International Conference on Analysis of Algorithms"],"format":[],"contributor":["Coordination Episciences iam"],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/"},"container":null,"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f","originalId":["oai:episciences.org:dmtcs:3387"],"pid":[],"dateofcollection":"2022-04-12T19:57:43.247Z","lastupdatetimestamp":1663599101233,"projects":null,"context":null,"instance":[{"measures":null,"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.46298/dmtcs.3387"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":null},"type":"Article","url":["https://dmtcs.episciences.org/3387"],"articleprocessingcharge":null,"publicationdate":"2005-01-01","refereed":"UNKNOWN","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}}],"eoscIF":null,"subject":{},"keywords":["data compression","complex asymptotics","suffix trees","combinatorics on words","pattern matching","autocorrelation polynomial","[INFO.INFO-DS] Computer Science [cs]/Data Structures and Algorithms [cs.DS]","[INFO.INFO-DM] Computer Science [cs]/Discrete Mathematics [cs.DM]","[MATH.MATH-CO] Mathematics [math]/Combinatorics [math.CO]","[INFO.INFO-CG] Computer Science [cs]/Computational Geometry [cs.CG]"],"affiliation":null}