From e84f5b5e6457d8e47fea99a860cd27f41c67882e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 2 Oct 2023 09:25:16 +0200 Subject: [PATCH] extended existing codo to accomodate import of POCI from open citation --- .../CreateActionSetSparkJob.java | 29 +++++++++++++------ .../opencitations/GetOpenCitationsRefs.java | 16 ++++++---- .../actionmanager/opencitations/ReadCOCI.java | 18 +++++++++--- .../opencitations/as_parameters.json | 5 ++++ .../opencitations/input_parameters.json | 6 ++++ .../input_readcoci_parameters.json | 7 ++++- .../opencitations/oozie_app/workflow.xml | 9 ++++-- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java index 4c658e52f..dafd82120 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -26,7 +26,6 @@ import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; @@ -35,7 +34,9 @@ import scala.Tuple2; public class CreateActionSetSparkJob implements Serializable { public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations"; public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations"; - private static final String ID_PREFIX = "50|doi_________::"; + private static final String DOI_PREFIX = "50|doi_________::"; + + private static final String PMID_PREFIX = "50|pmid________::"; private static final String TRUST = "0.91"; private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class); @@ -67,6 +68,9 @@ public class CreateActionSetSparkJob implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath {}", outputPath); + final String prefix = parser.get("prefix"); + log.info("prefix {}", prefix); + final boolean shouldDuplicateRels = Optional .ofNullable(parser.get("shouldDuplicateRels")) .map(Boolean::valueOf) @@ -77,13 +81,13 @@ public class CreateActionSetSparkJob implements Serializable { conf, isSparkSessionManaged, spark -> { - extractContent(spark, inputPath, outputPath, shouldDuplicateRels); + extractContent(spark, inputPath, outputPath, shouldDuplicateRels, prefix); }); } private static void extractContent(SparkSession spark, String inputPath, String outputPath, - boolean shouldDuplicateRels) { + boolean shouldDuplicateRels, String prefix) { spark .read() .textFile(inputPath + "/*") @@ -91,7 +95,8 @@ public class CreateActionSetSparkJob implements Serializable { (MapFunction) value -> OBJECT_MAPPER.readValue(value, COCI.class), Encoders.bean(COCI.class)) .flatMap( - (FlatMapFunction) value -> createRelation(value, shouldDuplicateRels).iterator(), + (FlatMapFunction) value -> createRelation(value, shouldDuplicateRels, prefix) + .iterator(), Encoders.bean(Relation.class)) .filter((FilterFunction) value -> value != null) .toJavaRDD() @@ -103,13 +108,19 @@ public class CreateActionSetSparkJob implements Serializable { } - private static List createRelation(COCI value, boolean duplicate) { + private static List createRelation(COCI value, boolean duplicate, String p) { List relationList = new ArrayList<>(); + String prefix; + if (p.equals("COCI")) { + prefix = DOI_PREFIX; + } else { + prefix = PMID_PREFIX; + } - String citing = ID_PREFIX + String citing = prefix + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting())); - final String cited = ID_PREFIX + final String cited = prefix + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited())); if (!citing.equals(cited)) { @@ -120,7 +131,7 @@ public class CreateActionSetSparkJob implements Serializable { cited, ModelConstants.CITES)); if (duplicate && value.getCiting().endsWith(".refs")) { - citing = ID_PREFIX + IdentifierFactory + citing = prefix + IdentifierFactory .md5( CleaningFunctions .normalizePidValue( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java index 3530c9980..60dc998ef 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java @@ -45,6 +45,9 @@ public class GetOpenCitationsRefs implements Serializable { final String hdfsNameNode = parser.get("hdfsNameNode"); log.info("hdfsNameNode {}", hdfsNameNode); + final String prefix = parser.get("prefix"); + log.info("prefix {}", prefix); + Configuration conf = new Configuration(); conf.set("fs.defaultFS", hdfsNameNode); @@ -53,30 +56,31 @@ public class GetOpenCitationsRefs implements Serializable { GetOpenCitationsRefs ocr = new GetOpenCitationsRefs(); for (String file : inputFile) { - ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem); + ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem, prefix); } } - private void doExtract(String inputFile, String workingPath, FileSystem fileSystem) + private void doExtract(String inputFile, String workingPath, FileSystem fileSystem, String prefix) throws IOException { final Path path = new Path(inputFile); FSDataInputStream oc_zip = fileSystem.open(path); - int count = 1; + // int count = 1; try (ZipInputStream zis = new ZipInputStream(oc_zip)) { ZipEntry entry = null; while ((entry = zis.getNextEntry()) != null) { if (!entry.isDirectory()) { String fileName = entry.getName(); - fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count; - count++; + // fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count; + fileName = fileName.substring(0, fileName.lastIndexOf(".")); + // count++; try ( FSDataOutputStream out = fileSystem - .create(new Path(workingPath + "/COCI/" + fileName + ".gz")); + .create(new Path(workingPath + "/" + prefix + "/" + fileName + ".gz")); GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) { IOUtils.copy(zis, gzipOs); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java index 4293ca187..3d384de9d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java @@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable { final String workingPath = parser.get("workingPath"); log.info("workingPath {}", workingPath); + final String format = parser.get("format"); + log.info("format {}", format); + SparkConf sconf = new SparkConf(); final String delimiter = Optional @@ -64,13 +67,14 @@ public class ReadCOCI implements Serializable { workingPath, inputFile, outputPath, - delimiter); + delimiter, + format); }); } private static void doRead(SparkSession spark, String workingPath, String[] inputFiles, String outputPath, - String delimiter) throws IOException { + String delimiter, String format) throws IOException { for (String inputFile : inputFiles) { String p_string = workingPath + "/" + inputFile + ".gz"; @@ -87,9 +91,15 @@ public class ReadCOCI implements Serializable { cociData.map((MapFunction) row -> { COCI coci = new COCI(); + if (format.equals("COCI")) { + coci.setCiting(row.getString(1)); + coci.setCited(row.getString(2)); + } else { + coci.setCiting(String.valueOf(row.getInt(1))); + coci.setCited(String.valueOf(row.getInt(2))); + } coci.setOci(row.getString(0)); - coci.setCiting(row.getString(1)); - coci.setCited(row.getString(2)); + return coci; }, Encoders.bean(COCI.class)) .write() diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json index 308e02026..e25d1f4b8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json @@ -21,5 +21,10 @@ "paramLongName": "shouldDuplicateRels", "paramDescription": "the hdfs name node", "paramRequired": false +},{ + "paramName": "p", + "paramLongName": "prefix", + "paramDescription": "the hdfs name node", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json index 4910ad11d..96db7eeb7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json @@ -16,5 +16,11 @@ "paramLongName": "hdfsNameNode", "paramDescription": "the hdfs name node", "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "prefix", + "paramDescription": "COCI or POCI", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json index b57cb5d9a..fa840089d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json @@ -30,7 +30,12 @@ "paramLongName": "inputFile", "paramDescription": "the hdfs name node", "paramRequired": true - } + }, { + "paramName": "f", + "paramLongName": "format", + "paramDescription": "the hdfs name node", + "paramRequired": true +} ] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml index 0f01039f7..4807da903 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml @@ -60,6 +60,7 @@ --hdfsNameNode${nameNode} --inputFile${inputFile} --workingPath${workingPath} + --prefix${prefix} @@ -82,10 +83,11 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --workingPath${workingPath}/COCI - --outputPath${workingPath}/COCI_JSON/ + --workingPath${workingPath}/${prefix} + --outputPath${workingPath}/${prefix}_JSON/ --delimiter${delimiter} --inputFile${inputFileCoci} + --format${prefix} @@ -108,8 +110,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${workingPath}/COCI_JSON + --inputPath${workingPath}/${prefix}_JSON --outputPath${outputPath} + --prefix${prefix}