diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Instance.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Instance.java index 3864b3f..d094ebf 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Instance.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Instance.java @@ -66,6 +66,16 @@ public class Instance implements Serializable { private String fulltext; + private List eoscDsId; + + public List getEoscDsId() { + return eoscDsId; + } + + public void setEoscDsId(List eoscId) { + this.eoscDsId = eoscId; + } + public String getFulltext() { return fulltext; } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java index f2149bb..7dcaa6b 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/DumpProducts.java @@ -74,7 +74,7 @@ public class DumpProducts implements Serializable { return null; } - return ResultMapper.map(value, communityMap); + return ResultMapper.map(value, communityMap, null); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index b844bf0..d483c36 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -6,6 +6,11 @@ import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,6 +23,7 @@ import eu.dnetlib.dhp.eosc.model.Measure; import eu.dnetlib.dhp.eosc.model.OpenAccessRoute; import eu.dnetlib.dhp.eosc.model.Provenance; import eu.dnetlib.dhp.eosc.model.Result; +import eu.dnetlib.dhp.oa.graph.dump.eosc.MasterDuplicate; import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException; import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -27,9 +33,11 @@ public class ResultMapper implements Serializable { private static final Logger log = LoggerFactory.getLogger(ResultMapper.class); public static Result map( - E in, Map communityMap) + E in, Map communityMap, + List eoscIds) throws NoAvailableEntityTypeException, CardinalityTooHighException { + log.info("*****************" + eoscIds.size()); Result out = new Result(); eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in; @@ -49,7 +57,7 @@ public class ResultMapper implements Serializable { mapFormat(out, input); out.setId(input.getId()); mapOriginalId(out, input); - mapInstance(out, input); + mapInstance(out, input, eoscIds); mapLamguage(out, input); mapLastUpdateTimestamp(out, input); mapTitle(out, input); @@ -255,7 +263,8 @@ public class ResultMapper implements Serializable { } } - private static void mapInstance(Result out, eu.dnetlib.dhp.schema.oaf.Result input) { + private static void mapInstance(Result out, eu.dnetlib.dhp.schema.oaf.Result input, + List eoscIds) { if (Optional .ofNullable(input.getInstance()) .isPresent()) { @@ -264,7 +273,7 @@ public class ResultMapper implements Serializable { input .getInstance() .stream() - .map(i -> getCommunityInstance(i, input.getResulttype().getClassid())) + .map(i -> getCommunityInstance(i, input.getResulttype().getClassid(), eoscIds)) .collect(Collectors.toList())); } @@ -538,7 +547,7 @@ public class ResultMapper implements Serializable { } private static eu.dnetlib.dhp.eosc.model.Instance getCommunityInstance(eu.dnetlib.dhp.schema.oaf.Instance i, - String resultType) { + String resultType, List eoscIds) { eu.dnetlib.dhp.eosc.model.Instance instance = new eu.dnetlib.dhp.eosc.model.Instance(); setCommonValue(i, instance); @@ -547,6 +556,27 @@ public class ResultMapper implements Serializable { .setHostedby( CfHbKeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue())); + List eoscDsIds = eoscIds + .stream() + .filter( + dm -> dm + .getGraphId() + .equals(i.getHostedby().getKey()) || + dm + .getGraphId() + .equals(i.getCollectedfrom().getKey())) + .collect(Collectors.toList()); + + if (eoscDsIds.size() > 0) { + instance + .setEoscDsId( + eoscDsIds + .stream() + .map(dm -> dm.getEoscId()) + .collect(Collectors.toList())); + + } + if (resultType.equals("publication") || resultType.equals("other")) { if (Optional.ofNullable(i.getFulltext()).isPresent()) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscDatasourceId.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscDatasourceId.java index dcf8364..32a8ec8 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscDatasourceId.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscDatasourceId.java @@ -1,19 +1,8 @@ + package eu.dnetlib.dhp.oa.graph.dump.eosc; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import javax.rmi.CORBA.Util; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; @@ -21,63 +10,93 @@ import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.Optional; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import javax.rmi.CORBA.Util; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.catalyst.plans.logical.ColumnStat; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Datasource; /** * @author miriam.baglioni * @Date 20/09/23 */ public class EoscDatasourceId implements Serializable { - private static final Logger log = LoggerFactory.getLogger(EoscDatasourceId.class); - private static String EOSC = "10|openaire____::2e06c1122c7df43765fdcf91080824fa"; - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SaveCommunityMap.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + private static final Logger log = LoggerFactory.getLogger(EoscDatasourceId.class); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SaveCommunityMap.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json")); - final String nameNode = parser.get("nameNode"); - log.info("nameNode: {}", nameNode); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath); - mapEoscIdentifier(spark, inputPath, outputPath); - }); - } + SparkConf conf = new SparkConf(); - private static void mapEoscIdentifier(SparkSession spark, String inputPath, String outputPath){ - spark.read() - .schema(Encoders.bean(Datasource.class).schema()) - .json(inputPath + "/datasource") - .withColumn("collfrom", functions.explode(new Column("collectedfrom.key"))) - .filter("collfrom == " + EOSC) - .withColumn("orId", functions.explode(new Column("originalId"))) - .filter(new Column("orId").startsWith("eosc")) - .select( - new Column("id").as("graphId"), - new Column("orId").as("eoscId")); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath); + mapEoscIdentifier(spark, inputPath, outputPath); + }); + } - } + private static void mapEoscIdentifier(SparkSession spark, String inputPath, String outputPath) { + final StructType structureSchema = new StructType() + .add("masterId", DataTypes.StringType) + .add("masterName", DataTypes.StringType) + .add("duplicateId", DataTypes.StringType); + + org.apache.spark.sql.Dataset df = spark.read().schema(structureSchema).parquet(inputPath); + +// spark +// .read() +// .schema(Encoders.bean(Datasource.class).schema()) +// .json(inputPath + "/datasource") +// .withColumn("orId", functions.explode(new Column("originalId"))) + df + .filter(new Column("duplicateId").startsWith("eosc")) + .select( + new Column("masterId").as("graphId"), + functions.substring_index(new Column("duplicateId"), "::", -1).as("eoscId")) + .filter((FilterFunction) r -> !((String) r.getAs("graphId")).startsWith(("10|eosc"))) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .save(outputPath); + + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscMasterDuplicate.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscMasterDuplicate.java new file mode 100644 index 0000000..1e5d36b --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/EoscMasterDuplicate.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.dhp.oa.graph.dump.eosc; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author miriam.baglioni + * @Date 25/09/23 + */ +public class EoscMasterDuplicate { + + private static final Logger log = LoggerFactory.getLogger(EoscMasterDuplicate.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + EoscMasterDuplicate.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + log.info("postgresUrl: {}", dbUrl); + + final String dbUser = parser.get("postgresUser"); + log.info("postgresUser: {}", dbUser); + + final String dbPassword = parser.get("postgresPassword"); + log.info("postgresPassword: {}", dbPassword); + + final String hdfsPath = parser.get("hdfsPath"); + log.info("hdfsPath: {}", hdfsPath); + + final String hdfsNameNode = parser.get("hdfsNameNode"); + log.info("hdfsNameNode: {}", hdfsNameNode); + + ReadDatasourceMasterDuplicateFromDB.execute(dbUrl, dbUser, dbPassword, hdfsPath, hdfsNameNode); + + } + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/MasterDuplicate.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/MasterDuplicate.java new file mode 100644 index 0000000..90e5e3c --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/MasterDuplicate.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.oa.graph.dump.eosc; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 25/09/23 + */ +public class MasterDuplicate implements Serializable { + private String eoscId; + private String graphId; + private String graphName; + + public String getEoscId() { + return eoscId; + } + + public void setEoscId(String eoscId) { + this.eoscId = eoscId; + } + + public String getGraphId() { + return graphId; + } + + public void setGraphId(String graphId) { + this.graphId = graphId; + } + + public String getGraphName() { + return graphName; + } + + public void setGraphName(String graphName) { + this.graphName = graphName; + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ReadDatasourceMasterDuplicateFromDB.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ReadDatasourceMasterDuplicateFromDB.java new file mode 100644 index 0000000..fc438ef --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ReadDatasourceMasterDuplicateFromDB.java @@ -0,0 +1,128 @@ +// +// Source code recreated from a .class file by IntelliJ IDEA +// (powered by FernFlower decompiler) +// + +package eu.dnetlib.dhp.oa.graph.dump.eosc; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; + +public class ReadDatasourceMasterDuplicateFromDB { + private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);"; + + public ReadDatasourceMasterDuplicateFromDB() { + } + + public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) + throws IOException { + int count = 0; + DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword); + Throwable var7 = null; + + try { + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + FileSystem fileSystem = FileSystem.get(conf); + FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath)); + log + .info( + "running query: {}", + "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);"); + log.info("storing results in: {}", hdfsPath); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); + Throwable var12 = null; + + try { + dbClient + .processResults( + "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);", + (rs) -> { + writeMap(datasourceMasterMap(rs), writer); + }); + ++count; + } catch (Throwable var35) { + var12 = var35; + throw var35; + } finally { + if (writer != null) { + if (var12 != null) { + try { + writer.close(); + } catch (Throwable var34) { + var12.addSuppressed(var34); + } + } else { + writer.close(); + } + } + + } + } catch (Throwable var37) { + var7 = var37; + throw var37; + } finally { + if (dbClient != null) { + if (var7 != null) { + try { + dbClient.close(); + } catch (Throwable var33) { + var7.addSuppressed(var33); + } + } else { + dbClient.close(); + } + } + + } + + return count; + } + + private static MasterDuplicate datasourceMasterMap(ResultSet rs) { + try { + MasterDuplicate md = new MasterDuplicate(); + String duplicateId = rs.getString("duplicateId"); + String masterId = rs.getString("masterId"); + String masterName = rs.getString("masterName"); + if (duplicateId.startsWith("eosc")) { + md.setEoscId(duplicateId.substring(duplicateId.lastIndexOf("::") + 2)); + md.setGraphId(OafMapperUtils.createOpenaireId(10, masterId, true)); + md.setGraphName(masterName); + return md; + } + return null; + } catch (SQLException var5) { + throw new RuntimeException(var5); + } + } + + private static void writeMap(MasterDuplicate dm, BufferedWriter writer) { + if (dm == null) + return; + try { + writer.write(OBJECT_MAPPER.writeValueAsString(dm)); + writer.newLine(); + } catch (IOException var3) { + throw new RuntimeException(var3); + } + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java index 1268d6a..631058a 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java @@ -4,15 +4,16 @@ package eu.dnetlib.dhp.oa.graph.dump.eosc; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,6 +50,9 @@ public class SelectEoscResultsJobStep1 implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + final String eoscDatasourceIdsPath = parser.get("eoscDatasourceIdsPath"); + log.info("eoscDatasourceIdsPath: {}", eoscDatasourceIdsPath); + final String communityMapPath = parser.get("communityMapPath"); log.info("communityMapPath: {}", communityMapPath); @@ -65,13 +69,29 @@ public class SelectEoscResultsJobStep1 implements Serializable { isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath); - selectEoscResults(spark, inputPath, outputPath, inputClazz, communityMapPath); + selectEoscResults(spark, inputPath, outputPath, inputClazz, communityMapPath, eoscDatasourceIdsPath); }); } private static void selectEoscResults(SparkSession spark, String inputPath, String outputPath, - Class inputClazz, String communityMapPath) { + Class inputClazz, String communityMapPath, String eoscDatasourceIdsPath) { + +// final StructType structureSchema = new StructType() +// .add("eoscId", DataTypes.StringType) +// .add("graphId", DataTypes.StringType) +// .add("graphName", DataTypes.StringType); +// +// // .fromDDL("`graphId`: STRING, `eoscId`:STRING"); +// org.apache.spark.sql.Dataset df = spark +// .read() +// .schema(structureSchema) +// .json(eoscDatasourceIdsPath); + + List df = Utils + .readPath(spark, eoscDatasourceIdsPath, MasterDuplicate.class) + .collectAsList(); + log.info("number of rows ************: " + df.size()); CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath); Utils .readPath(spark, inputPath, inputClazz) @@ -81,26 +101,10 @@ public class SelectEoscResultsJobStep1 implements Serializable { r .getCollectedfrom() .stream() - .anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND")) - || - r.getInstance().stream().anyMatch(i -> i.getHostedby().getValue().equalsIgnoreCase("ARCHE")) || - r - .getInstance() - .stream() - .anyMatch(i -> i.getHostedby().getValue().equalsIgnoreCase("LINDAT/CLARIN repository")) - || - r - .getInstance() - .stream() - .anyMatch( - i -> i - .getHostedby() - .getValue() - .equalsIgnoreCase("Publications at Bielefeld University")))) - + .anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND")))) .map( (MapFunction) r -> (Result) ResultMapper - .map(r, communityMap), + .map(r, communityMap, df), Encoders.bean(Result.class)) .write() .mode(SaveMode.Overwrite) diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json new file mode 100644 index 0000000..fbe2cca --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "pu", + "paramLongName": "postgresUrl", + "paramDescription": "the jdbc url to the postgres", + "paramRequired": true + }, + { + "paramName": "uid", + "paramLongName": "postgresUser", + "paramDescription": "the postgres user", + "paramRequired": true + }, + { + "paramName": "pwd", + "paramLongName": "postgresPassword", + "paramDescription": "the postgres password=", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the target path on HDFS", + "paramRequired": true + }, + { + "paramName": "nn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the HDFS nameNode", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml index 51c14c5..2e354d3 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml @@ -97,19 +97,44 @@ --nameNode${nameNode} --isLookUpUrl${isLookUpUrl} - + - + - eu.dnetlib.dhp.oa.graph.dump.eosc.EoscDatasourceId - --outputPath${workingDir}/eoscDatasourceIds - --inputPath${sourcePath} - --nameNode${nameNode} + eu.dnetlib.dhp.oa.graph.dump.eosc.EoscMasterDuplicate + --postgresUrl${postgresURL} + --postgresUser${postgresUser} + --postgresPassword${postgresPassword} + --hdfsPath${workingDir}/masterduplicate + --hdfsNameNode${nameNode} + + + + + + + + + + + + + + + + + + + + + + + @@ -139,6 +164,7 @@ --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${workingDir}/dump/publication --communityMapPath${workingDir}/communityMap + --eoscDatasourceIdsPath${workingDir}/masterduplicate @@ -214,6 +240,7 @@ --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${workingDir}/dump/dataset --communityMapPath${workingDir}/communityMap + --eoscDatasourceIdsPath${workingDir}/masterduplicate @@ -289,6 +316,7 @@ --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${workingDir}/dump/otherresearchproduct --communityMapPath${workingDir}/communityMap + --eoscDatasourceIdsPath${workingDir}/masterduplicate @@ -364,6 +392,7 @@ --resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${workingDir}/dump/software --communityMapPath${workingDir}/communityMap + --eoscDatasourceIdsPath${workingDir}/masterduplicate diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json new file mode 100644 index 0000000..0447ca0 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_identifiers_parameters.json @@ -0,0 +1,24 @@ +[ + + { + "paramName":"sp", + "paramLongName":"sourcePath", + "paramDescription": "the path where to find the result", + "paramRequired": true + }, + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where the relations are stored", + "paramRequired": false + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "the path where to store the results", + "paramRequired": true + } +] + + + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json index 605db73..098495d 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json @@ -29,6 +29,12 @@ "paramLongName":"communityMapPath", "paramDescription": "The path to the community map", "paramRequired": true + }, + { + "paramName":"edip", + "paramLongName":"eoscDatasourceIdsPath", + "paramDescription": "The path to the community map", + "paramRequired": true } ] diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java index 679653e..0b7fd5b 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java @@ -85,13 +85,16 @@ public class SelectEoscResultTest { final String cmp = getClass() .getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json") .getPath(); - + final String mdp = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate") + .getPath(); SelectEoscResultsJobStep1.main(new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-outputPath", workingDir.toString() + "/publication", "-sourcePath", sourcePath, "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-communityMapPath", cmp + "-communityMapPath", cmp, + "-eoscDatasourceIdsPath", mdp }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -109,6 +112,8 @@ public class SelectEoscResultTest { .filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0) .count()); + tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r))); + } // "source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba" diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate new file mode 100644 index 0000000..47b3dc9 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate @@ -0,0 +1,64 @@ +{"eoscId":"eosc.unipd.12d35bb1f56d4b91bb4644faf76d9486","graphId":"10|re3data_____::dba855a84b750cd034f9a2f37d2c10c5","graphName":"Research Data Unipd"} +{"eoscId":"eosc.vilnius-university.1ec069c1620d49d460e4cbcec0af57f6","graphId":"10|re3data_____::238ee7cec914359ecf962a31f04685df","graphName":"MIDAS"} +{"eoscId":"eosc.ccsd.06cdd3ff4700bb4c8e7bf22c14f23f5b","graphId":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","graphName":"Episciences"} +{"eoscId":"eosc.esrf.ecc74ab09791c52aa238ee77ae988874","graphId":"10|fairsharing_::2996962656838a97af4c5f926fe6f1b0","graphName":"European Synchrotron Radiation Facility Data Portal"} +{"eoscId":"eosc.psnc.6f0470e3bb9203ec3a7553f3a72a7a1f","graphId":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","graphName":"ROHub"} +{"eoscId":"eosc.gwdg.d6521479ffa922bbccc839606b8ec7c5","graphId":"10|re3data_____::f52792889d64d1a688b43ed989f6464a","graphName":"TextGrid Repository"} +{"eoscId":"eosc.cessda-eric.7e17e8817404ce7a8013be373723b2be","graphId":"10|re3data_____::bcf017a6702b1ff5fcaccca1ca44c010","graphName":"CESSDA Data Catalogue"} +{"eoscId":"eosc.lindatclariah-cz.6dc98fcb5294282acf3d92f3ab3376b2","graphId":"10|re3data_____::a507cdacc5bbcc08761c92185dee5cab","graphName":"LINDAT/CLARIN repository"} +{"eoscId":"eosc.ceric-eric.e9354332fd75190b935b80c1ba30b837","graphId":"10|re3data_____::e79f66fcea0a894d66ecff3da5e52311","graphName":"CERIC Data Portal"} +{"eoscId":"eosc.dtu.sciencedata","graphId":"10|re3data_____::33d41eba0d4e9d43f28d201faa65227d","graphName":"ScienceData"} +{"eoscId":"eosc.hn.02e4d980399d7142506e8aadb2b8e865","graphId":"10|re3data_____::fabe5c1aaa2e2d4c847e01647b87bf60","graphName":"ISIDORE"} +{"eoscId":"eosc.uit.49e8d4cef23bda3b66dd417e6675727d","graphId":"10|re3data_____::ee77795f142f22530033c0677f26925a","graphName":"The Tromsø Repository of Language and Linguistics (TROLLing)"} +{"eoscId":"eosc.ror-org.24ef0000cfbf3ce7f3a40ba6b87e76ce","graphId":"10|openaire____::d6b01d836828249d33006d954b9b8fef","graphName":"Research Organization Registry (ROR)"} +{"eoscId":"eosc.eudat.b2find","graphId":"10|re3data_____::730f562f9efe8a3b3742d2da510d4335","graphName":"B2FIND"} +{"eoscId":"eosc.cnr_-_isti.dbe89d2b83f3e29caab7923a51c1d151","graphId":"10|opendoar____::81930c54e08b6d26d9638dd2e4656dc1","graphName":"ISTI Open Portal"} +{"eoscId":"ni4os.nsl-ge.digital_repository_of_georgian_scientific_works","graphId":"10|eurocrisdris::e4e613eee91d199e1771cc3211412d8d","graphName":"Digital Repository of Georgian Scientific Works"} +{"eoscId":"eosc.clarin-eric.2aad8ade139792a49b130b539e1bb144","graphId":"10|fairsharing_::7d265aa7147bd3913fb84c7963a209d1","graphName":"Virtual Language Observatory"} +{"eoscId":"eosc.csuc.135887d3dea4b6723095d13c28dd52a3","graphId":"10|re3data_____::12afc40bb115d19b871acd8889b79340","graphName":"CORA. Repositori de Dades de Recerca"} +{"eoscId":"eosc.european_xfel.european_xfel_metadata_catalogue","graphId":"10|re3data_____::3b3687361f2a742aadf47284b64822c8","graphName":"European XFEL Data Portal"} +{"eoscId":"eosc.icos_eric.25c5f3f0674fb287e05e697263e211e2","graphId":"10|re3data_____::01157446e93d2515c2f1bed5fbfe6502","graphName":"ICOS Carbon Portal"} +{"eoscId":"eosc.fris.8f42bfccf70de38b01763b704300f882","graphId":"10|eurocrisdris::15aa175cd2b1390833bae20f3d7b7719","graphName":"Flemish Research Information Space"} +{"eoscId":"eosc.embl-ebi.e29a4e098afa05818957179f05d8e21d","graphId":"10|fairsharing_::52947e0ade57a09e4a1386d08f17b656","graphName":"Identifiers.org Central Registry"} +{"eoscId":"eosc.cyfronet.b59c2171d05ed9fb9e70a86d544f42a3","graphId":"10|re3data_____::4aec7545cc10a21a277de10d87e64028","graphName":"RODBUK Cracow Open Research Data Repository"} +{"eoscId":"eosc.scipedia.0063745e5964b19c3e9ceeb2bd6632f5","graphId":"10|opendoar____::b8a03c5c15fcfa8dae0b03351eb1742f","graphName":"Scipedia"} +{"eoscId":"eosc.eudat.17bb7bb8ef1af0f9bdb55a7db30cfa8a","graphId":"10|re3data_____::ad3609c351bd520edf6f10f5e0d9b877","graphName":"B2SHARE"} +{"eoscId":"eosc.openaire.openapc","graphId":"10|apc_________::e2b1600b229fc30663c8a1f662debddf","graphName":"OpenAPC Global Initiative"} +{"eoscId":"eosc.kit.re3data_-_registry_of_research_data_repositories","graphId":"10|openaire____::21f8a223b9925c2f87c404096080b046","graphName":"Registry of Research Data Repository"} +{"eoscId":"eosc.eudat.9168f179ffab97584bf99a2729837545","graphId":"10|re3data_____::a632666349a0bb9a36096c9e152d34cc","graphName":"B2SAFE"} +{"eoscId":"eosc.taltechdata.tallinn_university_of_technology_data_repository","graphId":"10|opendoar____::acb3a881c7ce9abcae0ce8c99c86a906","graphName":"TalTechData"} +{"eoscId":"eosc.oxford_e-research_centre.21697de1a5b10b8eb5fad857edecf5c9","graphId":"10|openaire____::bf5a61cc330e21ffa90eed3eb1533466","graphName":"FAIRsharing"} +{"eoscId":"eosc.materialscloud.materials_cloud_archive","graphId":"10|fairsharing_::a431d70133ef6cf688bc4f6093922b48","graphName":"Materials Cloud"} +{"eoscId":"eosc.nilu.actris_data_portal","graphId":"10|fairsharing_::5a01f0597ac4bdf35c24846734ee9a76","graphName":"ACTRIS Data Centre"} +{"eoscId":"eosc.csc-fi.fairdata_services","graphId":"10|opendoar____::5699ea73cda4c69b17c2255ec26db204","graphName":"Fairdata Services"} +{"eoscId":"eosc.inria.5923d0f31f0acda46cf4b592972284a2","graphId":"10|openaire____::dbfd07503aaa1ed31beed7dec942f3f4","graphName":"Software Heritage"} +{"eoscId":"eosc.acdh-ch.3b0149bee976d6db7eef053159e97a87","graphId":"10|re3data_____::69162d0a40bab7cc80b40ec90da874b9","graphName":"ARCHE"} +{"eoscId":"eosc.rli.661cdfdc74561b8eb69583b8137799d2","graphId":"10|fairsharing_::0cbed40c0d920b94126eaf5e707be1f5","graphName":"Open Energy Platform"} +{"eoscId":"eosc.bbmri-eric.314cee7546a7489c2cc3ab79d34e2640","graphId":"10|fairsharing_::4491777b1aa8b5b32c2e8666dbe1a495","graphName":"BBMRI-ERIC Directory"} +{"eoscId":"eosc.unibi-ub.a61d9ea844bdf43e6feabd6b14dfe3c5","graphId":"10|opendoar____::229754d7799160502a143a72f6789927","graphName":"Publications at Bielefeld University"} +{"eoscId":"eosc.ku_leuven.68bf19ae7ee1bc7e3872255e96550c04","graphId":"10|eurocrisdris::64e793796c88e19742bffb84023872cf","graphName":"Leuven Institutional Repository and Information Archiving System(LIRIAS)"} +{"eoscId":"eosc.olos.olos","graphId":"10|re3data_____::3b7cffda8156920b25a5f1831537f930","graphName":"OLOS"} +{"eoscId":"eosc.wenmr.d288225c333b07fc9d001da5c5392741","graphId":"10|fairsharing_::acab0116c354964a558e65bdd07ff047","graphName":"MetalPDB"} +{"eoscId":"eosc.zpid.b96341f00ca4c3a314abcc07fc0084b2","graphId":"10|fairsharing_::f9ff6540c092abd6a77908c034710a04","graphName":"PsychArchives"} +{"eoscId":"eosc.dsmz.bacdive__the_bacterial_diversity_metadatabase","graphId":"10|fairsharing_::a8345c3bb9e3896ea538ce77ffaf2c20","graphName":"Bacterial Diversity Metadatabase"} +{"eoscId":"eosc.gesis.doi_registration_service","graphId":"10|fairsharing_::b5baa9c23ac3e015ad287b17a3d4afa3","graphName":"da|ra"} +{"eoscId":"ni4os.grnet.ni4os-europe_repository_service","graphId":"10|opendoar____::ad80947c9909dd9d70739ca2b8f3fd2d","graphName":"NI4OS Europe Repository"} +{"eoscId":"eosc.charite_bih_brain_simulation.vre","graphId":"10|fairsharing_::159c1ffe5b61b41b3c4d8f4c2150f6c4","graphName":"Virtual Research Environment"} +{"eoscId":"eosc.riga_stradins_university.4ea61809e753e65a459bbe4a492c773b","graphId":"10|re3data_____::fd32b03d9f88de45c1ac672f8ccf4071","graphName":"Riga Stradins University dataverse"} +{"eoscId":"eosc.dkrz.9ffffb05aaf22e7f9138dca4560a8c8b","graphId":"10|re3data_____::d18674e211f46fb9e49b1d3122a9c01e","graphName":"World Data Center for Climate at DKRZ"} +{"eoscId":"eosc.vamdc.c967f669aa354e584e6786ee1d0c823e","graphId":"10|re3data_____::f7fca0a726f12b706e54c309d94364f9","graphName":"VAMDC Portal"} +{"eoscId":"eosc.psi.f1a79f572f95bc2fbea5cdc40ef4eb22","graphId":"10|re3data_____::1e55174ff77ed2d804871281201dbb50","graphName":"PSI Open Data Provider"} +{"eoscId":"eosc.openaire.2bb8710e1870170a175110615698e677","graphId":"10|openaire____::e034d6a11054f5ade9221ebac484e864","graphName":"ScholExplorer"} +{"eoscId":"eosc.elixir-uk.5126ffcc8e23f65bbbe219d36128f2c8","graphId":"10|fairsharing_::c8cd63e1bf13c5016881652983fb615a","graphName":"WorkflowHub"} +{"eoscId":"eosc.gdansk_tech.1434de11c83986b5be5592677f28d171","graphId":"10|re3data_____::b099df0bc21422bd4288486c034b43de","graphName":"Most Wiedzy Open Research Data Catalog"} +{"eoscId":"eosc.gbif.14ac40283813a624bd74ae82605ded23","graphId":"10|re3data_____::194f60618405f8d2dc58ea68d968a104","graphName":"Global Biodiversity Information Facility"} +{"eoscId":"eosc.vliz.61c6dae33d794d477e6a68ed43f52eb3","graphId":"10|fairsharing_::c6036a69be21cb660499b75718a3ef24","graphName":"World Register of Marine Species"} +{"eoscId":"eosc.ku_leuven.1cb0937dc41e70d8126d7b259ad470af","graphId":"10|re3data_____::35fe5e9a5f59f9491ee04381119c5e3b","graphName":"KU Leuven RDR"} +{"eoscId":"eosc.cern.8025243fa3c887159fc9b3930ae147c2","graphId":"10|re3data_____::ac9ac6a041c4159d498591638b47009e","graphName":"CERN Open Data"} +{"eoscId":"eosc.lapp.ef0bb7d889d0cced364444495f7a1e67","graphId":"10|re3data_____::c19518b015a3941a3e0675d398ca33f6","graphName":"Open-source Scientific Software and Service Repository"} +{"eoscId":"eosc.awi_bremerhaven.2882af227241cb956c28fe321a70dfb2","graphId":"10|re3data_____::9633d1e8c4309c833c2c442abeb0cfeb","graphName":"PANGAEA - Data Publisher for Earth and Environmental Science"} +{"eoscId":"eosc.inoe_2000.infra-art_spectral_library","graphId":"10|re3data_____::261b6dae636dee9f7fbd91b686ec9835","graphName":"INFRA-ART Spectral Library"} +{"eoscId":"eosc.ill.d422cba59746f39d10bdfea5c9cf8511","graphId":"10|re3data_____::ade37d060405975243eab9ce3f7aae78","graphName":"ILL Data Portal"} +{"eoscId":"eosc.hits.901e9baaa76d72017ebd7dfd93436caf","graphId":"10|fairsharing_::0233f3bb964cf325a30f8b1c2ed2da93","graphName":"FAIRDOMHub"} +{"eoscId":"eosc.openaire.0a02f13310296033694acead588a773b","graphId":"10|opendoar____::358aee4cc897452c00244351e4d91f69","graphName":"ZENODO"} +{"eoscId":"eosc.lifewatch-eric.ecoportal","graphId":"10|fairsharing_::9e6a921fbc428b5638b3986e365d4f21","graphName":"Lifewatch ERIC EcoPortal"} \ No newline at end of file