forked from D-Net/dnet-hadoop
Merge pull request 'extended existing code to import of POCI from open citation' (#340) from importpoci into beta
Reviewed-on: D-Net/dnet-hadoop#340
This commit is contained in commit f344ad76d0.
@@ -12,6 +12,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
@@ -30,12 +31,16 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import eu.dnetlib.dhp.utils.DHPUtils;
 import scala.Tuple2;

 public class CreateActionSetSparkJob implements Serializable {
     public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
     public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
-    private static final String ID_PREFIX = "50|doi_________::";
+    private static final String DOI_PREFIX = "50|doi_________::";
+
+    private static final String PMID_PREFIX = "50|pmid________::";
     private static final String TRUST = "0.91";

     private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
@@ -84,34 +89,56 @@ public class CreateActionSetSparkJob implements Serializable {

     private static void extractContent(SparkSession spark, String inputPath, String outputPath,
         boolean shouldDuplicateRels) {
-        spark
+
+        getTextTextJavaPairRDD(spark, inputPath, shouldDuplicateRels, "COCI")
+            .union(getTextTextJavaPairRDD(spark, inputPath, shouldDuplicateRels, "POCI"))
+            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+
+    }
+
+    private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(SparkSession spark, String inputPath,
+        boolean shouldDuplicateRels, String prefix) {
+        return spark
             .read()
-            .textFile(inputPath + "/*")
+            .textFile(inputPath + "/" + prefix + "/" + prefix + "_JSON/*")
             .map(
                 (MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
                 Encoders.bean(COCI.class))
             .flatMap(
-                (FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
+                (FlatMapFunction<COCI, Relation>) value -> createRelation(
+                    value, shouldDuplicateRels, prefix)
+                        .iterator(),
                 Encoders.bean(Relation.class))
             .filter((FilterFunction<Relation>) value -> value != null)
             .toJavaRDD()
             .map(p -> new AtomicAction(p.getClass(), p))
             .mapToPair(
                 aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
-                    new Text(OBJECT_MAPPER.writeValueAsString(aa))))
-            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+                    new Text(OBJECT_MAPPER.writeValueAsString(aa))));

     }

-    private static List<Relation> createRelation(COCI value, boolean duplicate) {
+    private static List<Relation> createRelation(COCI value, boolean duplicate, String p) {

         List<Relation> relationList = new ArrayList<>();
-        String citing = ID_PREFIX
-            + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
-        final String cited = ID_PREFIX
-            + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
+        String prefix;
+        String citing;
+        String cited;
+
+        if (p.equals("COCI")) {
+            prefix = DOI_PREFIX;
+            citing = prefix
+                + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
+            cited = prefix
+                + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
+
+        } else {
+            prefix = PMID_PREFIX;
+            citing = prefix
+                + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("pmid", value.getCiting()));
+            cited = prefix
+                + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("pmid", value.getCited()));
+
+        }

         if (!citing.equals(cited)) {
             relationList
                 .add(
@@ -120,7 +147,7 @@ public class CreateActionSetSparkJob implements Serializable {
                     cited, ModelConstants.CITES));

             if (duplicate && value.getCiting().endsWith(".refs")) {
-                citing = ID_PREFIX + IdentifierFactory
+                citing = prefix + IdentifierFactory
                     .md5(
                         CleaningFunctions
                             .normalizePidValue(
@@ -132,59 +159,30 @@ public class CreateActionSetSparkJob implements Serializable {
         return relationList;
     }

-    private static Collection<Relation> getRelations(String citing, String cited) {
-
-        return Arrays
-            .asList(
-                getRelation(citing, cited, ModelConstants.CITES),
-                getRelation(cited, citing, ModelConstants.IS_CITED_BY));
-    }
-
     public static Relation getRelation(
         String source,
         String target,
         String relclass) {
-        Relation r = new Relation();
-        r.setCollectedfrom(getCollectedFrom());
-        r.setSource(source);
-        r.setTarget(target);
-        r.setRelClass(relclass);
-        r.setRelType(ModelConstants.RESULT_RESULT);
-        r.setSubRelType(ModelConstants.CITATION);
-        r
-            .setDataInfo(
-                getDataInfo());
-        return r;
-    }
-
-    public static List<KeyValue> getCollectedFrom() {
-        KeyValue kv = new KeyValue();
-        kv.setKey(ModelConstants.OPENOCITATIONS_ID);
-        kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
-
-        return Arrays.asList(kv);
-    }
-
-    public static DataInfo getDataInfo() {
-        DataInfo di = new DataInfo();
-        di.setInferred(false);
-        di.setDeletedbyinference(false);
-        di.setTrust(TRUST);
-
-        di
-            .setProvenanceaction(
-                getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS));
-        return di;
-    }
-
-    public static Qualifier getQualifier(String class_id, String class_name,
-        String qualifierSchema) {
-        Qualifier pa = new Qualifier();
-        pa.setClassid(class_id);
-        pa.setClassname(class_name);
-        pa.setSchemeid(qualifierSchema);
-        pa.setSchemename(qualifierSchema);
-        return pa;
-    }
-
+        return OafMapperUtils
+            .getRelation(
+                source,
+                target,
+                ModelConstants.RESULT_RESULT,
+                ModelConstants.CITATION,
+                relclass,
+                Arrays
+                    .asList(
+                        OafMapperUtils.keyValue(ModelConstants.OPENOCITATIONS_ID, ModelConstants.OPENOCITATIONS_NAME)),
+                OafMapperUtils
+                    .dataInfo(
+                        false, null, false, false,
+                        OafMapperUtils
+                            .qualifier(
+                                OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME,
+                                ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
+                        TRUST),
+                null);
+    }
 }
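In short: the action-set job now reads both the COCI and the POCI JSON dumps, unions them, and lets createRelation pick an index-specific identifier prefix, DOI-based for COCI and PMID-based for POCI. A minimal sketch of that resolution, reusing the IdentifierFactory and CleaningFunctions calls shown above (the oafId helper is illustrative, not part of the commit):

    // Illustrative only: how an OpenAIRE result id is derived from a citing/cited pid
    // once the prefix has been selected for the index being processed.
    private static String oafId(String prefix, String pidType, String pidValue) {
        return prefix + IdentifierFactory.md5(CleaningFunctions.normalizePidValue(pidType, pidValue));
    }

    // COCI rows carry DOIs:  oafId(DOI_PREFIX, "doi", value.getCiting())
    // POCI rows carry PMIDs: oafId(PMID_PREFIX, "pmid", value.getCiting())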
@@ -45,6 +45,9 @@ public class GetOpenCitationsRefs implements Serializable {
         final String hdfsNameNode = parser.get("hdfsNameNode");
         log.info("hdfsNameNode {}", hdfsNameNode);

+        final String prefix = parser.get("prefix");
+        log.info("prefix {}", prefix);
+
         Configuration conf = new Configuration();
         conf.set("fs.defaultFS", hdfsNameNode);

@@ -53,30 +56,31 @@ public class GetOpenCitationsRefs implements Serializable {
         GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();

         for (String file : inputFile) {
-            ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
+            ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem, prefix);
         }

     }

-    private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
+    private void doExtract(String inputFile, String workingPath, FileSystem fileSystem, String prefix)
         throws IOException {

         final Path path = new Path(inputFile);

         FSDataInputStream oc_zip = fileSystem.open(path);

-        int count = 1;
+        // int count = 1;
         try (ZipInputStream zis = new ZipInputStream(oc_zip)) {
             ZipEntry entry = null;
             while ((entry = zis.getNextEntry()) != null) {

                 if (!entry.isDirectory()) {
                     String fileName = entry.getName();
-                    fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
-                    count++;
+                    // fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
+                    fileName = fileName.substring(0, fileName.lastIndexOf("."));
+                    // count++;
                     try (
                         FSDataOutputStream out = fileSystem
-                            .create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
+                            .create(new Path(workingPath + "/" + prefix + "/" + fileName + ".gz"));
                         GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {

                             IOUtils.copy(zis, gzipOs);
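The extraction step now keeps the original entry name (minus its extension) instead of the old T-truncated name with a counter, and writes every re-gzipped dump file under a per-index folder. Combined with the workflow changes below, the working directory would roughly look like this (illustrative layout, assuming one run with prefix=COCI and one with prefix=POCI):

    ${workingPath}/COCI/Original/       downloaded COCI archives
    ${workingPath}/COCI/COCI/*.gz       extracted, re-compressed CSV files
    ${workingPath}/COCI/COCI_JSON/      output of ReadCOCI, input of the action-set job
    ${workingPath}/POCI/Original/       same layout for the PubMed-based index
    ${workingPath}/POCI/POCI/*.gz
    ${workingPath}/POCI/POCI_JSON/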
@@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
         final String workingPath = parser.get("workingPath");
         log.info("workingPath {}", workingPath);

+        final String format = parser.get("format");
+        log.info("format {}", format);
+
         SparkConf sconf = new SparkConf();

         final String delimiter = Optional
@@ -64,13 +67,14 @@ public class ReadCOCI implements Serializable {
                     workingPath,
                     inputFile,
                     outputPath,
-                    delimiter);
+                    delimiter,
+                    format);
             });
     }

     private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
         String outputPath,
-        String delimiter) throws IOException {
+        String delimiter, String format) throws IOException {

         for (String inputFile : inputFiles) {
             String p_string = workingPath + "/" + inputFile + ".gz";
@@ -87,9 +91,15 @@ public class ReadCOCI implements Serializable {

             cociData.map((MapFunction<Row, COCI>) row -> {
                 COCI coci = new COCI();
-                coci.setOci(row.getString(0));
-                coci.setCiting(row.getString(1));
-                coci.setCited(row.getString(2));
+                if (format.equals("COCI")) {
+                    coci.setCiting(row.getString(1));
+                    coci.setCited(row.getString(2));
+                } else {
+                    coci.setCiting(String.valueOf(row.getInt(1)));
+                    coci.setCited(String.valueOf(row.getInt(2)));
+                }
+                coci.setOci(row.getString(0));

                 return coci;
             }, Encoders.bean(COCI.class))
                 .write()
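The same COCI bean is reused for both indexes; what differs is only how the citing and cited columns are typed in the CSV dumps, so the reader switches on the new format argument. A commented restatement of the branch above (illustrative; the column types are inferred from the accessor calls in the diff):

    if (format.equals("COCI")) {
        // COCI: citing and cited are DOIs, already strings in the CSV
        coci.setCiting(row.getString(1));
        coci.setCited(row.getString(2));
    } else {
        // POCI: citing and cited are numeric PMIDs, converted to strings
        // so the downstream createRelation logic stays unchanged
        coci.setCiting(String.valueOf(row.getInt(1)));
        coci.setCited(String.valueOf(row.getInt(2)));
    }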
@@ -21,5 +21,10 @@
     "paramLongName": "shouldDuplicateRels",
     "paramDescription": "the hdfs name node",
     "paramRequired": false
+  },{
+    "paramName": "p",
+    "paramLongName": "prefix",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": true
   }
 ]
@@ -16,5 +16,11 @@
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
+  },
+  {
+    "paramName": "p",
+    "paramLongName": "prefix",
+    "paramDescription": "COCI or POCI",
+    "paramRequired": true
   }
 ]
@@ -30,6 +30,11 @@
     "paramLongName": "inputFile",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
+  }, {
+    "paramName": "f",
+    "paramLongName": "format",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": true
   }
 ]
@@ -46,7 +46,7 @@
             </configuration>
             <exec>download.sh</exec>
             <argument>${filelist}</argument>
-            <argument>${workingPath}/Original</argument>
+            <argument>${workingPath}/${prefix}/Original</argument>
             <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
             <file>download.sh</file>
             <capture-output/>
@@ -59,7 +59,8 @@
             <main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
             <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
             <arg>--inputFile</arg><arg>${inputFile}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}/${prefix}</arg>
+            <arg>--prefix</arg><arg>${prefix}</arg>
         </java>
         <ok to="read"/>
         <error to="Kill"/>
@@ -82,10 +83,11 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
-            <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
+            <arg>--workingPath</arg><arg>${workingPath}/${prefix}/${prefix}</arg>
+            <arg>--outputPath</arg><arg>${workingPath}/${prefix}/${prefix}_JSON/</arg>
             <arg>--delimiter</arg><arg>${delimiter}</arg>
             <arg>--inputFile</arg><arg>${inputFileCoci}</arg>
+            <arg>--format</arg><arg>${prefix}</arg>
         </spark>
         <ok to="create_actionset"/>
         <error to="Kill"/>
@@ -108,8 +110,9 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
+            <arg>--inputPath</arg><arg>${workingPath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--prefix</arg><arg>${prefix}</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
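With the prefix parameterisation, the same Oozie workflow covers both indexes and is simply submitted once per index instead of being hard-wired to COCI. A sketch of the per-run properties, assuming the property names used above (all values are placeholders):

    # DOI-based index
    prefix=COCI
    workingPath=<hdfs working directory>
    inputFile=<dump archives to download and extract>
    inputFileCoci=<extracted csv files to read>
    outputPath=<target directory for the action set>

    # a second submission with prefix=POCI covers the PubMed-based index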