add the EOSC datasource id information at the instance level

This commit is contained in:
Miriam Baglioni 2023-09-26 11:14:45 +02:00
parent a7866a26f7
commit 483214e78f
14 changed files with 529 additions and 93 deletions

View File

@ -66,6 +66,16 @@ public class Instance implements Serializable {
private String fulltext;
private List<String> eoscDsId;
public List<String> getEoscDsId() {
return eoscDsId;
}
public void setEoscDsId(List<String> eoscId) {
this.eoscDsId = eoscId;
}
public String getFulltext() {
return fulltext;
}
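
With the new field in place, an instance hosted by an EOSC datasource carries the EOSC identifier next to the existing attributes. A minimal sketch of the serialized form, assuming a hypothetical B2FIND-hosted instance (ids taken from the masterduplicate dump at the end of this commit):

{"hostedby":{"key":"10|re3data_____::730f562f9efe8a3b3742d2da510d4335","value":"B2FIND"},"eoscDsId":["eosc.eudat.b2find"]}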

View File

@ -74,7 +74,7 @@ public class DumpProducts implements Serializable {
return null;
}
return ResultMapper.map(value, communityMap);
return ResultMapper.map(value, communityMap, null);
}
}

View File

@ -6,6 +6,11 @@ import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -18,6 +23,7 @@ import eu.dnetlib.dhp.eosc.model.Measure;
import eu.dnetlib.dhp.eosc.model.OpenAccessRoute;
import eu.dnetlib.dhp.eosc.model.Provenance;
import eu.dnetlib.dhp.eosc.model.Result;
import eu.dnetlib.dhp.oa.graph.dump.eosc.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -27,9 +33,11 @@ public class ResultMapper implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ResultMapper.class);
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in, Map<String, String> communityMap)
E in, Map<String, String> communityMap,
List<MasterDuplicate> eoscIds)
throws NoAvailableEntityTypeException, CardinalityTooHighException {
log.info("*****************" + eoscIds.size());
Result out = new Result();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
@ -49,7 +57,7 @@ public class ResultMapper implements Serializable {
mapFormat(out, input);
out.setId(input.getId());
mapOriginalId(out, input);
mapInstance(out, input);
mapInstance(out, input, eoscIds);
mapLamguage(out, input);
mapLastUpdateTimestamp(out, input);
mapTitle(out, input);
@ -255,7 +263,8 @@ public class ResultMapper implements Serializable {
}
}
private static void mapInstance(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
private static void mapInstance(Result out, eu.dnetlib.dhp.schema.oaf.Result input,
List<MasterDuplicate> eoscIds) {
if (Optional
.ofNullable(input.getInstance())
.isPresent()) {
@ -264,7 +273,7 @@ public class ResultMapper implements Serializable {
input
.getInstance()
.stream()
.map(i -> getCommunityInstance(i, input.getResulttype().getClassid()))
.map(i -> getCommunityInstance(i, input.getResulttype().getClassid(), eoscIds))
.collect(Collectors.toList()));
}
@ -538,7 +547,7 @@ public class ResultMapper implements Serializable {
}
private static eu.dnetlib.dhp.eosc.model.Instance getCommunityInstance(eu.dnetlib.dhp.schema.oaf.Instance i,
String resultType) {
String resultType, List<MasterDuplicate> eoscIds) {
eu.dnetlib.dhp.eosc.model.Instance instance = new eu.dnetlib.dhp.eosc.model.Instance();
setCommonValue(i, instance);
@ -547,6 +556,27 @@ public class ResultMapper implements Serializable {
.setHostedby(
CfHbKeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
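// keep only the master/duplicate pairs whose graph id matches this instance's hostedby or collectedfrom datasource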
List<MasterDuplicate> eoscDsIds = Optional
.ofNullable(eoscIds)
.orElse(new ArrayList<>())
.stream()
.filter(
dm -> dm
.getGraphId()
.equals(i.getHostedby().getKey()) ||
dm
.getGraphId()
.equals(i.getCollectedfrom().getKey()))
.collect(Collectors.toList());
if (!eoscDsIds.isEmpty()) {
instance
.setEoscDsId(
eoscDsIds
.stream()
.map(dm -> dm.getEoscId())
.collect(Collectors.toList()));
}
if (resultType.equals("publication") ||
resultType.equals("other")) {
if (Optional.ofNullable(i.getFulltext()).isPresent())

View File

@ -1,19 +1,8 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import javax.rmi.CORBA.Util;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
@ -21,63 +10,93 @@ import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
/**
* @author miriam.baglioni
* @Date 20/09/23
*/
public class EoscDatasourceId implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EoscDatasourceId.class);
private static String EOSC = "10|openaire____::2e06c1122c7df43765fdcf91080824fa";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SaveCommunityMap.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
private static final Logger log = LoggerFactory.getLogger(EoscDatasourceId.class);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SaveCommunityMap.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json"));
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
mapEoscIdentifier(spark, inputPath, outputPath);
});
}
SparkConf conf = new SparkConf();
private static void mapEoscIdentifier(SparkSession spark, String inputPath, String outputPath){
spark.read()
.schema(Encoders.bean(Datasource.class).schema())
.json(inputPath + "/datasource")
.withColumn("collfrom", functions.explode(new Column("collectedfrom.key")))
.filter("collfrom == " + EOSC)
.withColumn("orId", functions.explode(new Column("originalId")))
.filter(new Column("orId").startsWith("eosc"))
.select(
new Column("id").as("graphId"),
new Column("orId").as("eoscId"));
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
mapEoscIdentifier(spark, inputPath, outputPath);
});
}
}
private static void mapEoscIdentifier(SparkSession spark, String inputPath, String outputPath) {
final StructType structureSchema = new StructType()
.add("masterId", DataTypes.StringType)
.add("masterName", DataTypes.StringType)
.add("duplicateId", DataTypes.StringType);
org.apache.spark.sql.Dataset<Row> df = spark.read().schema(structureSchema).parquet(inputPath);
// spark
// .read()
// .schema(Encoders.bean(Datasource.class).schema())
// .json(inputPath + "/datasource")
// .withColumn("orId", functions.explode(new Column("originalId")))
df
.filter(new Column("duplicateId").startsWith("eosc"))
.select(
new Column("masterId").as("graphId"),
functions.substring_index(new Column("duplicateId"), "::", -1).as("eoscId"))
.filter((FilterFunction<Row>) r -> !((String) r.getAs("graphId")).startsWith("10|eosc"))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.save(outputPath);
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 25/09/23
*/
public class EoscMasterDuplicate {
private static final Logger log = LoggerFactory.getLogger(EoscMasterDuplicate.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
EoscMasterDuplicate.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json")));
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
log.info("postgresUrl: {}", dbUrl);
final String dbUser = parser.get("postgresUser");
log.info("postgresUser: {}", dbUser);
final String dbPassword = parser.get("postgresPassword");
log.info("postgresPassword: {}", dbPassword);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode: {}", hdfsNameNode);
ReadDatasourceMasterDuplicateFromDB.execute(dbUrl, dbUser, dbPassword, hdfsPath, hdfsNameNode);
}
}
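
The driver takes the five parameters declared in datasourcemaster_parameters.json; a hypothetical invocation, with placeholder connection values, would pass:

--postgresUrl jdbc:postgresql://localhost:5432/dnet_openaireplus
--postgresUser dnet
--postgresPassword ***
--hdfsPath /workingDir/masterduplicate
--hdfsNameNode hdfs://nameservice1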

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.oa.graph.dump.eosc;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 25/09/23
*/
public class MasterDuplicate implements Serializable {
private String eoscId;
private String graphId;
private String graphName;
public String getEoscId() {
return eoscId;
}
public void setEoscId(String eoscId) {
this.eoscId = eoscId;
}
public String getGraphId() {
return graphId;
}
public void setGraphId(String graphId) {
this.graphId = graphId;
}
public String getGraphName() {
return graphName;
}
public void setGraphName(String graphName) {
this.graphName = graphName;
}
}
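
Each JSON line of the masterduplicate file deserializes onto this bean (SelectEoscResultsJobStep1 reads it via Utils.readPath); for example, a line taken from the dump at the end of this commit:

{"eoscId":"eosc.eudat.b2find","graphId":"10|re3data_____::730f562f9efe8a3b3742d2da510d4335","graphName":"B2FIND"}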

View File

@ -0,0 +1,128 @@
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//
package eu.dnetlib.dhp.oa.graph.dump.eosc;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class ReadDatasourceMasterDuplicateFromDB {
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
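// pairs every dedup master datasource with each of its duplicates; only duplicates whose id starts with "eosc" become MasterDuplicate records (see datasourceMasterMap below)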
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException {
int count = 0;
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
log.info("running query: {}", QUERY);
log.info("storing results in: {}", hdfsPath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
dbClient
.processResults(
QUERY,
rs -> writeMap(datasourceMasterMap(rs), writer));
++count;
}
}
return count;
}
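// maps a database row onto a MasterDuplicate; as a hypothetical example modelled on the
// dump at the end of this commit, a row with duplicateId "eosc________::eosc.eudat.b2find"
// yields eoscId "eosc.eudat.b2find", with graphId derived from the master datasource id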
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try {
MasterDuplicate md = new MasterDuplicate();
String duplicateId = rs.getString("duplicateId");
String masterId = rs.getString("masterId");
String masterName = rs.getString("masterName");
if (duplicateId.startsWith("eosc")) {
md.setEoscId(duplicateId.substring(duplicateId.lastIndexOf("::") + 2));
md.setGraphId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setGraphName(masterName);
return md;
}
return null;
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
private static void writeMap(MasterDuplicate dm, BufferedWriter writer) {
if (dm == null)
return;
try {
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
writer.newLine();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -4,15 +4,16 @@ package eu.dnetlib.dhp.oa.graph.dump.eosc;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -49,6 +50,9 @@ public class SelectEoscResultsJobStep1 implements Serializable {
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String eoscDatasourceIdsPath = parser.get("eoscDatasourceIdsPath");
log.info("eoscDatasourceIdsPath: {}", eoscDatasourceIdsPath);
final String communityMapPath = parser.get("communityMapPath");
log.info("communityMapPath: {}", communityMapPath);
@ -65,13 +69,29 @@ public class SelectEoscResultsJobStep1 implements Serializable {
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
selectEoscResults(spark, inputPath, outputPath, inputClazz, communityMapPath);
selectEoscResults(spark, inputPath, outputPath, inputClazz, communityMapPath, eoscDatasourceIdsPath);
});
}
private static <R extends eu.dnetlib.dhp.schema.oaf.Result> void selectEoscResults(SparkSession spark,
String inputPath, String outputPath,
Class<R> inputClazz, String communityMapPath) {
Class<R> inputClazz, String communityMapPath, String eoscDatasourceIdsPath) {
// final StructType structureSchema = new StructType()
// .add("eoscId", DataTypes.StringType)
// .add("graphId", DataTypes.StringType)
// .add("graphName", DataTypes.StringType);
//
// // .fromDDL("`graphId`: STRING, `eoscId`:STRING");
// org.apache.spark.sql.Dataset<Row> df = spark
// .read()
// .schema(structureSchema)
// .json(eoscDatasourceIdsPath);
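// the master/duplicate pairs are expected to be few, so they are collected on the driver and handed to the ResultMapper together with each record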
List<MasterDuplicate> eoscIds = Utils
.readPath(spark, eoscDatasourceIdsPath, MasterDuplicate.class)
.collectAsList();
log.info("number of eosc datasource ids: {}", eoscIds.size());
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
Utils
.readPath(spark, inputPath, inputClazz)
@ -81,26 +101,10 @@ public class SelectEoscResultsJobStep1 implements Serializable {
r
.getCollectedfrom()
.stream()
.anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND"))
||
r.getInstance().stream().anyMatch(i -> i.getHostedby().getValue().equalsIgnoreCase("ARCHE")) ||
r
.getInstance()
.stream()
.anyMatch(i -> i.getHostedby().getValue().equalsIgnoreCase("LINDAT/CLARIN repository"))
||
r
.getInstance()
.stream()
.anyMatch(
i -> i
.getHostedby()
.getValue()
.equalsIgnoreCase("Publications at Bielefeld University"))))
.anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND"))))
.map(
(MapFunction<R, Result>) r -> (Result) ResultMapper
.map(r, communityMap),
.map(r, communityMap, eoscIds),
Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)

View File

@ -0,0 +1,32 @@
[
{
"paramName": "pu",
"paramLongName": "postgresUrl",
"paramDescription": "the jdbc url to the postgres",
"paramRequired": true
},
{
"paramName": "uid",
"paramLongName": "postgresUser",
"paramDescription": "the postgres user",
"paramRequired": true
},
{
"paramName": "pwd",
"paramLongName": "postgresPassword",
"paramDescription": "the postgres password=",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the target path on HDFS",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the HDFS nameNode",
"paramRequired": true
}
]

View File

@ -97,19 +97,44 @@
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</java>
<ok to="find_datasource_eosc_id"/>
<ok to="get_ds_master_duplicate"/>
<error to="Kill"/>
</action>
<action name="find_datasource_eosc_id">
<action name="get_ds_master_duplicate">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.eosc.EoscDatasourceId</main-class>
<arg>--outputPath</arg><arg>${workingDir}/eoscDatasourceIds</arg>
<arg>--inputPath</arg><arg>${sourcePath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<main-class>eu.dnetlib.dhp.oa.graph.dump.eosc.EoscMasterDuplicate</main-class>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
</java>
<ok to="fork_dump_eosc_result"/>
<error to="Kill"/>
</action>
<!-- <action name="find_datasource_eosc_id">-->
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
<!-- <master>yarn</master>-->
<!-- <mode>cluster</mode>-->
<!-- <name>Dump Publication For EOSC </name>-->
<!-- <class>eu.dnetlib.dhp.oa.graph.dump.eosc.EoscDatasourceId</class>-->
<!-- <jar>dump-${projectVersion}.jar</jar>-->
<!-- <spark-opts>-->
<!-- &#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!-- &#45;&#45;executor-cores=${sparkExecutorCores}-->
<!-- &#45;&#45;driver-memory=${sparkDriverMemory}-->
<!-- &#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!-- &#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!-- &#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!-- &#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!-- &#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}-->
<!-- </spark-opts>-->
<!-- <arg>&#45;&#45;sourcePath</arg><arg>${workingDir}/masterduplicate</arg>-->
<!-- <arg>&#45;&#45;outputPath</arg><arg>${workingDir}/eosc/datasourceids</arg>-->
<!-- </spark>-->
<!-- <ok to="fork_dump_eosc_result"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<fork name="fork_dump_eosc_result">
<path start="dump_eosc_publication"/>
@ -139,6 +164,7 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/publication</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--eoscDatasourceIdsPath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="extend_eosc_publication"/>
<error to="Kill"/>
@ -214,6 +240,7 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/dataset</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--eoscDatasourceIdsPath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="extend_eosc_dataset"/>
<error to="Kill"/>
@ -289,6 +316,7 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--eoscDatasourceIdsPath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="extend_eosc_orp"/>
<error to="Kill"/>
@ -364,6 +392,7 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/dump/software</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--eoscDatasourceIdsPath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="extend_eosc_software"/>
<error to="Kill"/>

View File

@ -0,0 +1,24 @@
[
{
"paramName":"sp",
"paramLongName":"sourcePath",
"paramDescription": "the path where to find the result",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the path where the relations are stored",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path where to store the results",
"paramRequired": true
}
]

View File

@ -29,6 +29,12 @@
"paramLongName":"communityMapPath",
"paramDescription": "The path to the community map",
"paramRequired": true
},
{
"paramName":"edip",
"paramLongName":"eoscDatasourceIdsPath",
"paramDescription": "The path to the community map",
"paramRequired": true
}
]

View File

@ -85,13 +85,16 @@ public class SelectEoscResultTest {
final String cmp = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
.getPath();
final String mdp = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate")
.getPath();
SelectEoscResultsJobStep1.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/publication",
"-sourcePath", sourcePath,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-communityMapPath", cmp
"-communityMapPath", cmp,
"-eoscDatasourceIdsPath", mdp
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -109,6 +112,8 @@ public class SelectEoscResultTest {
.filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0)
.count());
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
}
// "source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"

View File

@ -0,0 +1,64 @@
{"eoscId":"eosc.unipd.12d35bb1f56d4b91bb4644faf76d9486","graphId":"10|re3data_____::dba855a84b750cd034f9a2f37d2c10c5","graphName":"Research Data Unipd"}
{"eoscId":"eosc.vilnius-university.1ec069c1620d49d460e4cbcec0af57f6","graphId":"10|re3data_____::238ee7cec914359ecf962a31f04685df","graphName":"MIDAS"}
{"eoscId":"eosc.ccsd.06cdd3ff4700bb4c8e7bf22c14f23f5b","graphId":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","graphName":"Episciences"}
{"eoscId":"eosc.esrf.ecc74ab09791c52aa238ee77ae988874","graphId":"10|fairsharing_::2996962656838a97af4c5f926fe6f1b0","graphName":"European Synchrotron Radiation Facility Data Portal"}
{"eoscId":"eosc.psnc.6f0470e3bb9203ec3a7553f3a72a7a1f","graphId":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","graphName":"ROHub"}
{"eoscId":"eosc.gwdg.d6521479ffa922bbccc839606b8ec7c5","graphId":"10|re3data_____::f52792889d64d1a688b43ed989f6464a","graphName":"TextGrid Repository"}
{"eoscId":"eosc.cessda-eric.7e17e8817404ce7a8013be373723b2be","graphId":"10|re3data_____::bcf017a6702b1ff5fcaccca1ca44c010","graphName":"CESSDA Data Catalogue"}
{"eoscId":"eosc.lindatclariah-cz.6dc98fcb5294282acf3d92f3ab3376b2","graphId":"10|re3data_____::a507cdacc5bbcc08761c92185dee5cab","graphName":"LINDAT/CLARIN repository"}
{"eoscId":"eosc.ceric-eric.e9354332fd75190b935b80c1ba30b837","graphId":"10|re3data_____::e79f66fcea0a894d66ecff3da5e52311","graphName":"CERIC Data Portal"}
{"eoscId":"eosc.dtu.sciencedata","graphId":"10|re3data_____::33d41eba0d4e9d43f28d201faa65227d","graphName":"ScienceData"}
{"eoscId":"eosc.hn.02e4d980399d7142506e8aadb2b8e865","graphId":"10|re3data_____::fabe5c1aaa2e2d4c847e01647b87bf60","graphName":"ISIDORE"}
{"eoscId":"eosc.uit.49e8d4cef23bda3b66dd417e6675727d","graphId":"10|re3data_____::ee77795f142f22530033c0677f26925a","graphName":"The Tromsø Repository of Language and Linguistics (TROLLing)"}
{"eoscId":"eosc.ror-org.24ef0000cfbf3ce7f3a40ba6b87e76ce","graphId":"10|openaire____::d6b01d836828249d33006d954b9b8fef","graphName":"Research Organization Registry (ROR)"}
{"eoscId":"eosc.eudat.b2find","graphId":"10|re3data_____::730f562f9efe8a3b3742d2da510d4335","graphName":"B2FIND"}
{"eoscId":"eosc.cnr_-_isti.dbe89d2b83f3e29caab7923a51c1d151","graphId":"10|opendoar____::81930c54e08b6d26d9638dd2e4656dc1","graphName":"ISTI Open Portal"}
{"eoscId":"ni4os.nsl-ge.digital_repository_of_georgian_scientific_works","graphId":"10|eurocrisdris::e4e613eee91d199e1771cc3211412d8d","graphName":"Digital Repository of Georgian Scientific Works"}
{"eoscId":"eosc.clarin-eric.2aad8ade139792a49b130b539e1bb144","graphId":"10|fairsharing_::7d265aa7147bd3913fb84c7963a209d1","graphName":"Virtual Language Observatory"}
{"eoscId":"eosc.csuc.135887d3dea4b6723095d13c28dd52a3","graphId":"10|re3data_____::12afc40bb115d19b871acd8889b79340","graphName":"CORA. Repositori de Dades de Recerca"}
{"eoscId":"eosc.european_xfel.european_xfel_metadata_catalogue","graphId":"10|re3data_____::3b3687361f2a742aadf47284b64822c8","graphName":"European XFEL Data Portal"}
{"eoscId":"eosc.icos_eric.25c5f3f0674fb287e05e697263e211e2","graphId":"10|re3data_____::01157446e93d2515c2f1bed5fbfe6502","graphName":"ICOS Carbon Portal"}
{"eoscId":"eosc.fris.8f42bfccf70de38b01763b704300f882","graphId":"10|eurocrisdris::15aa175cd2b1390833bae20f3d7b7719","graphName":"Flemish Research Information Space"}
{"eoscId":"eosc.embl-ebi.e29a4e098afa05818957179f05d8e21d","graphId":"10|fairsharing_::52947e0ade57a09e4a1386d08f17b656","graphName":"Identifiers.org Central Registry"}
{"eoscId":"eosc.cyfronet.b59c2171d05ed9fb9e70a86d544f42a3","graphId":"10|re3data_____::4aec7545cc10a21a277de10d87e64028","graphName":"RODBUK Cracow Open Research Data Repository"}
{"eoscId":"eosc.scipedia.0063745e5964b19c3e9ceeb2bd6632f5","graphId":"10|opendoar____::b8a03c5c15fcfa8dae0b03351eb1742f","graphName":"Scipedia"}
{"eoscId":"eosc.eudat.17bb7bb8ef1af0f9bdb55a7db30cfa8a","graphId":"10|re3data_____::ad3609c351bd520edf6f10f5e0d9b877","graphName":"B2SHARE"}
{"eoscId":"eosc.openaire.openapc","graphId":"10|apc_________::e2b1600b229fc30663c8a1f662debddf","graphName":"OpenAPC Global Initiative"}
{"eoscId":"eosc.kit.re3data_-_registry_of_research_data_repositories","graphId":"10|openaire____::21f8a223b9925c2f87c404096080b046","graphName":"Registry of Research Data Repository"}
{"eoscId":"eosc.eudat.9168f179ffab97584bf99a2729837545","graphId":"10|re3data_____::a632666349a0bb9a36096c9e152d34cc","graphName":"B2SAFE"}
{"eoscId":"eosc.taltechdata.tallinn_university_of_technology_data_repository","graphId":"10|opendoar____::acb3a881c7ce9abcae0ce8c99c86a906","graphName":"TalTechData"}
{"eoscId":"eosc.oxford_e-research_centre.21697de1a5b10b8eb5fad857edecf5c9","graphId":"10|openaire____::bf5a61cc330e21ffa90eed3eb1533466","graphName":"FAIRsharing"}
{"eoscId":"eosc.materialscloud.materials_cloud_archive","graphId":"10|fairsharing_::a431d70133ef6cf688bc4f6093922b48","graphName":"Materials Cloud"}
{"eoscId":"eosc.nilu.actris_data_portal","graphId":"10|fairsharing_::5a01f0597ac4bdf35c24846734ee9a76","graphName":"ACTRIS Data Centre"}
{"eoscId":"eosc.csc-fi.fairdata_services","graphId":"10|opendoar____::5699ea73cda4c69b17c2255ec26db204","graphName":"Fairdata Services"}
{"eoscId":"eosc.inria.5923d0f31f0acda46cf4b592972284a2","graphId":"10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4","graphName":"Software Heritage"}
{"eoscId":"eosc.acdh-ch.3b0149bee976d6db7eef053159e97a87","graphId":"10|re3data_____::69162d0a40bab7cc80b40ec90da874b9","graphName":"ARCHE"}
{"eoscId":"eosc.rli.661cdfdc74561b8eb69583b8137799d2","graphId":"10|fairsharing_::0cbed40c0d920b94126eaf5e707be1f5","graphName":"Open Energy Platform"}
{"eoscId":"eosc.bbmri-eric.314cee7546a7489c2cc3ab79d34e2640","graphId":"10|fairsharing_::4491777b1aa8b5b32c2e8666dbe1a495","graphName":"BBMRI-ERIC Directory"}
{"eoscId":"eosc.unibi-ub.a61d9ea844bdf43e6feabd6b14dfe3c5","graphId":"10|opendoar____::229754d7799160502a143a72f6789927","graphName":"Publications at Bielefeld University"}
{"eoscId":"eosc.ku_leuven.68bf19ae7ee1bc7e3872255e96550c04","graphId":"10|eurocrisdris::64e793796c88e19742bffb84023872cf","graphName":"Leuven Institutional Repository and Information Archiving System(LIRIAS)"}
{"eoscId":"eosc.olos.olos","graphId":"10|re3data_____::3b7cffda8156920b25a5f1831537f930","graphName":"OLOS"}
{"eoscId":"eosc.wenmr.d288225c333b07fc9d001da5c5392741","graphId":"10|fairsharing_::acab0116c354964a558e65bdd07ff047","graphName":"MetalPDB"}
{"eoscId":"eosc.zpid.b96341f00ca4c3a314abcc07fc0084b2","graphId":"10|fairsharing_::f9ff6540c092abd6a77908c034710a04","graphName":"PsychArchives"}
{"eoscId":"eosc.dsmz.bacdive__the_bacterial_diversity_metadatabase","graphId":"10|fairsharing_::a8345c3bb9e3896ea538ce77ffaf2c20","graphName":"Bacterial Diversity Metadatabase"}
{"eoscId":"eosc.gesis.doi_registration_service","graphId":"10|fairsharing_::b5baa9c23ac3e015ad287b17a3d4afa3","graphName":"da|ra"}
{"eoscId":"ni4os.grnet.ni4os-europe_repository_service","graphId":"10|opendoar____::ad80947c9909dd9d70739ca2b8f3fd2d","graphName":"NI4OS Europe Repository"}
{"eoscId":"eosc.charite_bih_brain_simulation.vre","graphId":"10|fairsharing_::159c1ffe5b61b41b3c4d8f4c2150f6c4","graphName":"Virtual Research Environment"}
{"eoscId":"eosc.riga_stradins_university.4ea61809e753e65a459bbe4a492c773b","graphId":"10|re3data_____::fd32b03d9f88de45c1ac672f8ccf4071","graphName":"Riga Stradins University dataverse"}
{"eoscId":"eosc.dkrz.9ffffb05aaf22e7f9138dca4560a8c8b","graphId":"10|re3data_____::d18674e211f46fb9e49b1d3122a9c01e","graphName":"World Data Center for Climate at DKRZ"}
{"eoscId":"eosc.vamdc.c967f669aa354e584e6786ee1d0c823e","graphId":"10|re3data_____::f7fca0a726f12b706e54c309d94364f9","graphName":"VAMDC Portal"}
{"eoscId":"eosc.psi.f1a79f572f95bc2fbea5cdc40ef4eb22","graphId":"10|re3data_____::1e55174ff77ed2d804871281201dbb50","graphName":"PSI Open Data Provider"}
{"eoscId":"eosc.openaire.2bb8710e1870170a175110615698e677","graphId":"10|openaire____::e034d6a11054f5ade9221ebac484e864","graphName":"ScholExplorer"}
{"eoscId":"eosc.elixir-uk.5126ffcc8e23f65bbbe219d36128f2c8","graphId":"10|fairsharing_::c8cd63e1bf13c5016881652983fb615a","graphName":"WorkflowHub"}
{"eoscId":"eosc.gdansk_tech.1434de11c83986b5be5592677f28d171","graphId":"10|re3data_____::b099df0bc21422bd4288486c034b43de","graphName":"Most Wiedzy Open Research Data Catalog"}
{"eoscId":"eosc.gbif.14ac40283813a624bd74ae82605ded23","graphId":"10|re3data_____::194f60618405f8d2dc58ea68d968a104","graphName":"Global Biodiversity Information Facility"}
{"eoscId":"eosc.vliz.61c6dae33d794d477e6a68ed43f52eb3","graphId":"10|fairsharing_::c6036a69be21cb660499b75718a3ef24","graphName":"World Register of Marine Species"}
{"eoscId":"eosc.ku_leuven.1cb0937dc41e70d8126d7b259ad470af","graphId":"10|re3data_____::35fe5e9a5f59f9491ee04381119c5e3b","graphName":"KU Leuven RDR"}
{"eoscId":"eosc.cern.8025243fa3c887159fc9b3930ae147c2","graphId":"10|re3data_____::ac9ac6a041c4159d498591638b47009e","graphName":"CERN Open Data"}
{"eoscId":"eosc.lapp.ef0bb7d889d0cced364444495f7a1e67","graphId":"10|re3data_____::c19518b015a3941a3e0675d398ca33f6","graphName":"Open-source Scientific Software and Service Repository"}
{"eoscId":"eosc.awi_bremerhaven.2882af227241cb956c28fe321a70dfb2","graphId":"10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb","graphName":"PANGAEA - Data Publisher for Earth and Environmental Science"}
{"eoscId":"eosc.inoe_2000.infra-art_spectral_library","graphId":"10|re3data_____::261b6dae636dee9f7fbd91b686ec9835","graphName":"INFRA-ART Spectral Library"}
{"eoscId":"eosc.ill.d422cba59746f39d10bdfea5c9cf8511","graphId":"10|re3data_____::ade37d060405975243eab9ce3f7aae78","graphName":"ILL Data Portal"}
{"eoscId":"eosc.hits.901e9baaa76d72017ebd7dfd93436caf","graphId":"10|fairsharing_::0233f3bb964cf325a30f8b1c2ed2da93","graphName":"FAIRDOMHub"}
{"eoscId":"eosc.openaire.0a02f13310296033694acead588a773b","graphId":"10|opendoar____::358aee4cc897452c00244351e4d91f69","graphName":"ZENODO"}
{"eoscId":"eosc.lifewatch-eric.ecoportal","graphId":"10|fairsharing_::9e6a921fbc428b5638b3986e365d4f21","graphName":"Lifewatch ERIC EcoPortal"}