From 9898470b0e2c5b05ae73c5b2ae404f6f52cba2f4 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 2 Oct 2023 12:54:16 +0200 Subject: [PATCH] Addressing comments in https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/340\#issuecomment-10592 --- .../CreateActionSetSparkJob.java | 119 ++++++++---------- .../opencitations/oozie_app/workflow.xml | 2 +- 2 files changed, 54 insertions(+), 67 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java index dafd82120..e3a9833b3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -12,6 +12,7 @@ import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; @@ -26,9 +27,12 @@ import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class CreateActionSetSparkJob implements Serializable { @@ -68,9 +72,6 @@ public class CreateActionSetSparkJob implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath {}", outputPath); - final String prefix = parser.get("prefix"); - log.info("prefix {}", prefix); - final boolean shouldDuplicateRels = Optional .ofNullable(parser.get("shouldDuplicateRels")) .map(Boolean::valueOf) @@ -81,47 +82,62 @@ public class CreateActionSetSparkJob implements Serializable { conf, isSparkSessionManaged, spark -> { - extractContent(spark, inputPath, outputPath, shouldDuplicateRels, prefix); + extractContent(spark, inputPath, outputPath, shouldDuplicateRels); }); } private static void extractContent(SparkSession spark, String inputPath, String outputPath, + boolean shouldDuplicateRels) { + + getTextTextJavaPairRDD(spark, inputPath, shouldDuplicateRels, "COCI") + .union(getTextTextJavaPairRDD(spark, inputPath, shouldDuplicateRels, "POCI")) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + private static JavaPairRDD getTextTextJavaPairRDD(SparkSession spark, String inputPath, boolean shouldDuplicateRels, String prefix) { - spark + return spark .read() - .textFile(inputPath + "/*") + .textFile(inputPath + "/" + prefix + "/" + prefix + "_JSON/*") .map( (MapFunction) value -> OBJECT_MAPPER.readValue(value, COCI.class), Encoders.bean(COCI.class)) .flatMap( - (FlatMapFunction) value -> createRelation(value, shouldDuplicateRels, prefix) - .iterator(), + (FlatMapFunction) value -> createRelation( + value, shouldDuplicateRels, prefix) + .iterator(), Encoders.bean(Relation.class)) .filter((FilterFunction) value -> value != null) .toJavaRDD() .map(p -> new AtomicAction(p.getClass(), p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } private static List createRelation(COCI value, boolean duplicate, String p) { List relationList = new ArrayList<>(); String prefix; + String citing; + String cited; if (p.equals("COCI")) { prefix = DOI_PREFIX; + citing = prefix + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting())); + cited = prefix + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited())); + } else { prefix = PMID_PREFIX; - } + citing = prefix + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("pmid", value.getCiting())); + cited = prefix + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("pmid", value.getCited())); - String citing = prefix - + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting())); - final String cited = prefix - + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited())); + } if (!citing.equals(cited)) { relationList @@ -143,59 +159,30 @@ public class CreateActionSetSparkJob implements Serializable { return relationList; } - private static Collection getRelations(String citing, String cited) { - - return Arrays - .asList( - getRelation(citing, cited, ModelConstants.CITES), - getRelation(cited, citing, ModelConstants.IS_CITED_BY)); - } - public static Relation getRelation( String source, String target, String relclass) { - Relation r = new Relation(); - r.setCollectedfrom(getCollectedFrom()); - r.setSource(source); - r.setTarget(target); - r.setRelClass(relclass); - r.setRelType(ModelConstants.RESULT_RESULT); - r.setSubRelType(ModelConstants.CITATION); - r - .setDataInfo( - getDataInfo()); - return r; + + return OafMapperUtils + .getRelation( + source, + target, + ModelConstants.RESULT_RESULT, + ModelConstants.CITATION, + relclass, + Arrays + .asList( + OafMapperUtils.keyValue(ModelConstants.OPENOCITATIONS_ID, ModelConstants.OPENOCITATIONS_NAME)), + OafMapperUtils + .dataInfo( + false, null, false, false, + OafMapperUtils + .qualifier( + OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), + TRUST), + null); + } - - public static List getCollectedFrom() { - KeyValue kv = new KeyValue(); - kv.setKey(ModelConstants.OPENOCITATIONS_ID); - kv.setValue(ModelConstants.OPENOCITATIONS_NAME); - - return Arrays.asList(kv); - } - - public static DataInfo getDataInfo() { - DataInfo di = new DataInfo(); - di.setInferred(false); - di.setDeletedbyinference(false); - di.setTrust(TRUST); - - di - .setProvenanceaction( - getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS)); - return di; - } - - public static Qualifier getQualifier(String class_id, String class_name, - String qualifierSchema) { - Qualifier pa = new Qualifier(); - pa.setClassid(class_id); - pa.setClassname(class_name); - pa.setSchemeid(qualifierSchema); - pa.setSchemename(qualifierSchema); - return pa; - } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml index 4807da903..bd1932dd5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml @@ -110,7 +110,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${workingPath}/${prefix}_JSON + --inputPath${workingPath} --outputPath${outputPath} --prefix${prefix}