dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java

216 lines
6.5 KiB
Java
Raw Normal View History

2021-09-27 16:02:06 +02:00
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
2023-10-06 14:42:02 +02:00
import org.apache.hadoop.io.compress.GzipCodec;
2021-09-27 16:02:06 +02:00
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
2021-09-27 16:02:06 +02:00
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
2021-09-27 16:02:06 +02:00
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2024-03-12 23:12:32 +01:00
import com.fasterxml.jackson.core.JsonProcessingException;
2021-09-27 16:02:06 +02:00
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
2021-09-27 16:02:06 +02:00
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
2024-03-06 13:42:00 +01:00
import eu.dnetlib.dhp.schema.common.ModelSupport;
2021-09-27 16:02:06 +02:00
import eu.dnetlib.dhp.schema.oaf.*;
2023-10-10 09:36:11 +02:00
import eu.dnetlib.dhp.schema.oaf.utils.*;
import eu.dnetlib.dhp.utils.DHPUtils;
2021-09-27 16:02:06 +02:00
import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable {
public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
2023-10-04 12:32:05 +02:00
private static final String DOI_PREFIX = "50|doi_________::";
private static final String PMID_PREFIX = "50|pmid________::";
2024-03-12 23:12:32 +01:00
private static final String ARXIV_PREFIX = "50|arXiv_______::";
2023-10-04 12:32:05 +02:00
2024-03-12 23:12:32 +01:00
private static final String PMCID_PREFIX = "50|pmcid_______::";
2021-09-27 16:02:06 +02:00
private static final String TRUST = "0.91";
private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
2023-10-04 12:32:05 +02:00
2021-09-27 16:02:06 +02:00
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
CreateActionSetSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
2023-10-04 12:32:05 +02:00
log.info("inputPath {}", inputPath);
2021-09-27 16:02:06 +02:00
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
2024-03-12 23:12:32 +01:00
spark -> extractContent(spark, inputPath, outputPath));
2021-09-27 16:02:06 +02:00
}
2024-03-12 23:12:32 +01:00
private static void extractContent(SparkSession spark, String inputPath, String outputPath) {
2024-03-06 13:42:00 +01:00
getTextTextJavaPairRDD(spark, inputPath)
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
2024-03-06 13:42:00 +01:00
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(SparkSession spark, String inputPath) {
return spark
.read()
2024-03-06 13:42:00 +01:00
.textFile(inputPath)
.map(
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
Encoders.bean(COCI.class))
2021-09-27 16:02:06 +02:00
.flatMap(
(FlatMapFunction<COCI, Relation>) value -> createRelation(
2024-03-06 13:42:00 +01:00
value)
.iterator(),
2021-09-27 16:02:06 +02:00
Encoders.bean(Relation.class))
2023-10-04 12:32:05 +02:00
.filter((FilterFunction<Relation>) Objects::nonNull)
2021-09-27 16:02:06 +02:00
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
2021-09-27 16:02:06 +02:00
}
2024-03-12 23:12:32 +01:00
private static List<Relation> createRelation(COCI value) throws JsonProcessingException {
2021-09-27 16:02:06 +02:00
List<Relation> relationList = new ArrayList<>();
2024-03-06 13:42:00 +01:00
String citing;
String cited;
2021-09-27 16:02:06 +02:00
2024-03-06 13:42:00 +01:00
switch (value.getCiting_pid()) {
case "doi":
citing = DOI_PREFIX
2023-10-04 12:32:05 +02:00
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), value.getCiting()));
2024-03-06 13:42:00 +01:00
break;
case "pmid":
citing = PMID_PREFIX
2023-10-04 12:32:05 +02:00
+ IdentifierFactory
2024-03-12 23:12:32 +01:00
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), value.getCiting()));
break;
case "arxiv":
citing = ARXIV_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), value.getCiting()));
break;
case "pmcid":
citing = PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), value.getCiting()));
2023-10-04 12:32:05 +02:00
break;
2024-03-12 23:12:32 +01:00
case "isbn":
case "issn":
return relationList;
2024-03-06 13:42:00 +01:00
default:
2024-03-12 23:12:32 +01:00
throw new IllegalStateException("Invalid prefix: " + new ObjectMapper().writeValueAsString(value));
2024-03-06 13:42:00 +01:00
}
switch (value.getCited_pid()) {
case "doi":
cited = DOI_PREFIX
2023-10-04 12:32:05 +02:00
+ IdentifierFactory
2024-03-12 23:12:32 +01:00
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), value.getCited()));
2024-03-06 13:42:00 +01:00
break;
case "pmid":
cited = PMID_PREFIX
2023-10-04 12:32:05 +02:00
+ IdentifierFactory
2024-03-12 23:12:32 +01:00
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), value.getCited()));
break;
case "arxiv":
cited = ARXIV_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), value.getCited()));
break;
case "pmcid":
cited = PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), value.getCited()));
2023-10-04 12:32:05 +02:00
break;
2024-03-12 23:12:32 +01:00
case "isbn":
case "issn":
return relationList;
2023-10-04 12:32:05 +02:00
default:
2024-03-12 23:12:32 +01:00
throw new IllegalStateException("Invalid prefix: " + new ObjectMapper().writeValueAsString(value));
}
2021-09-27 16:02:06 +02:00
2022-02-18 15:19:15 +01:00
if (!citing.equals(cited)) {
relationList
.add(
getRelation(
2022-02-18 15:19:15 +01:00
citing,
cited, ModelConstants.CITES));
2021-09-27 16:02:06 +02:00
}
return relationList;
}
public static Relation getRelation(
String source,
String target,
2023-10-04 12:32:05 +02:00
String relClass) {
2021-09-27 16:02:06 +02:00
return OafMapperUtils
.getRelation(
source,
target,
ModelConstants.RESULT_RESULT,
ModelConstants.CITATION,
2023-10-04 12:32:05 +02:00
relClass,
Arrays
.asList(
OafMapperUtils.keyValue(ModelConstants.OPENOCITATIONS_ID, ModelConstants.OPENOCITATIONS_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
TRUST),
null);
2021-09-27 16:02:06 +02:00
}
2023-10-04 12:32:05 +02:00
2021-09-27 16:02:06 +02:00
}