extended existing code to accommodate import of POCI from OpenCitations

2023-10-02 09:25:16 +02:00 · 2023-10-02 09:25:16 +02:00 · e84f5b5e64
parent 0935d7757c
commit e84f5b5e64
7 changed files with 67 additions and 23 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java
@ -26,7 +26,6 @@ import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
@ -35,7 +34,9 @@ import scala.Tuple2;
 public class CreateActionSetSparkJob implements Serializable {
 	public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
 	public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
-	private static final String ID_PREFIX = "50|doi_________::";
+	private static final String DOI_PREFIX = "50|doi_________::";
+
+	private static final String PMID_PREFIX = "50|pmid________::";
 	private static final String TRUST = "0.91";

 	private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
@ -67,6 +68,9 @@ public class CreateActionSetSparkJob implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}", outputPath);

+		final String prefix = parser.get("prefix");
+		log.info("prefix {}", prefix);
+
 		final boolean shouldDuplicateRels = Optional
 			.ofNullable(parser.get("shouldDuplicateRels"))
 			.map(Boolean::valueOf)
@ -77,13 +81,13 @@ public class CreateActionSetSparkJob implements Serializable {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
-				extractContent(spark, inputPath, outputPath, shouldDuplicateRels);
+				extractContent(spark, inputPath, outputPath, shouldDuplicateRels, prefix);
 			});

 	}

 	private static void extractContent(SparkSession spark, String inputPath, String outputPath,
-		boolean shouldDuplicateRels) {
+		boolean shouldDuplicateRels, String prefix) {
 		spark
 			.read()
 			.textFile(inputPath + "/*")
@ -91,7 +95,8 @@ public class CreateActionSetSparkJob implements Serializable {
 				(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
 				Encoders.bean(COCI.class))
 			.flatMap(
-				(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
+				(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels, prefix)
+					.iterator(),
 				Encoders.bean(Relation.class))
 			.filter((FilterFunction<Relation>) value -> value != null)
 			.toJavaRDD()
@ -103,13 +108,19 @@ public class CreateActionSetSparkJob implements Serializable {

 	}

-	private static List<Relation> createRelation(COCI value, boolean duplicate) {
+	private static List<Relation> createRelation(COCI value, boolean duplicate, String p) {

 		List<Relation> relationList = new ArrayList<>();
+		String prefix;
+		if (p.equals("COCI")) {
+			prefix = DOI_PREFIX;
+		} else {
+			prefix = PMID_PREFIX;
+		}

-		String citing = ID_PREFIX
+		String citing = prefix
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
-		final String cited = ID_PREFIX
+		final String cited = prefix
 			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));

 		if (!citing.equals(cited)) {
@ -120,7 +131,7 @@ public class CreateActionSetSparkJob implements Serializable {
 						cited, ModelConstants.CITES));

 			if (duplicate && value.getCiting().endsWith(".refs")) {
-				citing = ID_PREFIX + IdentifierFactory
+				citing = prefix + IdentifierFactory
 					.md5(
 						CleaningFunctions
 							.normalizePidValue(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java
@ -45,6 +45,9 @@ public class GetOpenCitationsRefs implements Serializable {
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		log.info("hdfsNameNode {}", hdfsNameNode);

+		final String prefix = parser.get("prefix");
+		log.info("prefix {}", prefix);
+
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);

@ -53,30 +56,31 @@ public class GetOpenCitationsRefs implements Serializable {
 		GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();

 		for (String file : inputFile) {
-			ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
+			ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem, prefix);
 		}

 	}

-	private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
+	private void doExtract(String inputFile, String workingPath, FileSystem fileSystem, String prefix)
 		throws IOException {

 		final Path path = new Path(inputFile);

 		FSDataInputStream oc_zip = fileSystem.open(path);

-		int count = 1;
+		// int count = 1;
 		try (ZipInputStream zis = new ZipInputStream(oc_zip)) {
 			ZipEntry entry = null;
 			while ((entry = zis.getNextEntry()) != null) {

 				if (!entry.isDirectory()) {
 					String fileName = entry.getName();
-					fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
-					count++;
+					// fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
+					fileName = fileName.substring(0, fileName.lastIndexOf("."));
+					// count++;
 					try (
 						FSDataOutputStream out = fileSystem
-							.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
+							.create(new Path(workingPath + "/" + prefix + "/" + fileName + ".gz"));
 						GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {

 						IOUtils.copy(zis, gzipOs);
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java
@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath {}", workingPath);

+		final String format = parser.get("format");
+		log.info("format {}", format);
+
 		SparkConf sconf = new SparkConf();

 		final String delimiter = Optional
@ -64,13 +67,14 @@ public class ReadCOCI implements Serializable {
 					workingPath,
 					inputFile,
 					outputPath,
-					delimiter);
+					delimiter,
+					format);
 			});
 	}

 	private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
 		String outputPath,
-		String delimiter) throws IOException {
+		String delimiter, String format) throws IOException {

 		for (String inputFile : inputFiles) {
 			String p_string = workingPath + "/" + inputFile + ".gz";
@ -87,9 +91,15 @@ public class ReadCOCI implements Serializable {

 			cociData.map((MapFunction<Row, COCI>) row -> {
 				COCI coci = new COCI();
+				if (format.equals("COCI")) {
+					coci.setCiting(row.getString(1));
+					coci.setCited(row.getString(2));
+				} else {
+					coci.setCiting(String.valueOf(row.getInt(1)));
+					coci.setCited(String.valueOf(row.getInt(2)));
+				}
 				coci.setOci(row.getString(0));
-				coci.setCiting(row.getString(1));
-				coci.setCited(row.getString(2));
+
 				return coci;
 			}, Encoders.bean(COCI.class))
 				.write()
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json
@ -21,5 +21,10 @@
  "paramLongName": "shouldDuplicateRels",
  "paramDescription": "the hdfs name node",
  "paramRequired": false
+},{
+  "paramName": "p",
+  "paramLongName": "prefix",
+  "paramDescription": "COCI or POCI",
+  "paramRequired": true
 }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json
@ -16,5 +16,11 @@
    "paramLongName": "hdfsNameNode",
    "paramDescription": "the hdfs name node",
    "paramRequired": true
+  },
+  {
+    "paramName": "p",
+    "paramLongName": "prefix",
+    "paramDescription": "COCI or POCI",
+    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json
@ -30,7 +30,12 @@
    "paramLongName": "inputFile",
    "paramDescription": "the hdfs name node",
    "paramRequired": true
-  }
+  }, {
+  "paramName": "f",
+  "paramLongName": "format",
+  "paramDescription": "the input format, COCI or POCI",
+  "paramRequired": true
+}
 ]


--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
@ -60,6 +60,7 @@
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--inputFile</arg><arg>${inputFile}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--prefix</arg><arg>${prefix}</arg>
        </java>
        <ok to="read"/>
        <error to="Kill"/>
@ -82,10 +83,11 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
-            <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
+            <arg>--workingPath</arg><arg>${workingPath}/${prefix}</arg>
+            <arg>--outputPath</arg><arg>${workingPath}/${prefix}_JSON/</arg>
            <arg>--delimiter</arg><arg>${delimiter}</arg>
            <arg>--inputFile</arg><arg>${inputFileCoci}</arg>
+            <arg>--format</arg><arg>${prefix}</arg>
        </spark>
        <ok to="create_actionset"/>
        <error to="Kill"/>
@ -108,8 +110,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
-            <arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
+            <arg>--inputPath</arg><arg>${workingPath}/${prefix}_JSON</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--prefix</arg><arg>${prefix}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>