forked from D-Net/dnet-hadoop
dateOfCollection taken from orcid last_update.txt on hdfs; cleaned wf parameters
This commit is contained in:
parent
d43ea88caf
commit
53d7023460
|
@ -8,7 +8,9 @@ import java.util.List;
|
|||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -57,26 +59,33 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
.toString(
|
||||
SparkGenEnrichedOrcidWorks.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||
final String workingPath = parser.get("workingPath");
|
||||
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
||||
final String orcidDataFolder = parser.get("orcidDataFolder");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||
if (StringUtils.isBlank(lastUpdate)) {
|
||||
throw new RuntimeException("last update info not found");
|
||||
}
|
||||
final String dateOfCollection = lastUpdate.substring(0, 10);
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
Dataset<AuthorData> authorDataset = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath.concat("last_orcid_dataset/authors/*"))
|
||||
.textFile(workingPath.concat(orcidDataFolder).concat("/authors/*"))
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||
.filter(authorSummary -> authorSummary.getAuthorData() != null)
|
||||
.map(authorSummary -> authorSummary.getAuthorData())
|
||||
|
@ -87,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
Dataset<WorkDetail> workDataset = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath.concat("last_orcid_dataset/works/*"))
|
||||
.textFile(workingPath.concat(orcidDataFolder).concat("/works/*"))
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||
.filter(work -> work.getWorkDetail() != null)
|
||||
.map(work -> work.getWorkDetail())
|
||||
|
@ -134,7 +143,8 @@ public class SparkGenEnrichedOrcidWorks {
|
|||
errorsGeneric,
|
||||
errorsInvalidTitle,
|
||||
errorsNotFoundAuthors,
|
||||
errorsInvalidType);
|
||||
errorsInvalidType,
|
||||
dateOfCollection);
|
||||
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
|
||||
.map(
|
||||
e -> {
|
||||
|
|
|
@ -36,6 +36,7 @@ public class PublicationToOaf implements Serializable {
|
|||
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||
public static final String SEPARATOR = "::";
|
||||
|
||||
private String dateOfCollection = "";
|
||||
private final LongAccumulator parsedPublications;
|
||||
private final LongAccumulator enrichedPublications;
|
||||
private final LongAccumulator errorsGeneric;
|
||||
|
@ -49,13 +50,15 @@ public class PublicationToOaf implements Serializable {
|
|||
LongAccumulator errorsGeneric,
|
||||
LongAccumulator errorsInvalidTitle,
|
||||
LongAccumulator errorsNotFoundAuthors,
|
||||
LongAccumulator errorsInvalidType) {
|
||||
LongAccumulator errorsInvalidType,
|
||||
String dateOfCollection) {
|
||||
this.parsedPublications = parsedPublications;
|
||||
this.enrichedPublications = enrichedPublications;
|
||||
this.errorsGeneric = errorsGeneric;
|
||||
this.errorsInvalidTitle = errorsInvalidTitle;
|
||||
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
||||
this.errorsInvalidType = errorsInvalidType;
|
||||
this.dateOfCollection = dateOfCollection;
|
||||
}
|
||||
|
||||
public PublicationToOaf() {
|
||||
|
@ -137,7 +140,7 @@ public class PublicationToOaf implements Serializable {
|
|||
|
||||
publication.setLastupdatetimestamp(new Date().getTime());
|
||||
|
||||
publication.setDateofcollection("2020-10-14");
|
||||
publication.setDateofcollection(dateOfCollection);
|
||||
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
|
||||
|
||||
// Adding external ids
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"orcidDataFolder", "paramDescription": "the folder of orcid data", "paramRequired": true},
|
||||
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
|
||||
]
|
|
@ -85,8 +85,7 @@
|
|||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>-</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/</arg>
|
||||
<arg>-i</arg><arg>last_orcid_dataset</arg>
|
||||
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
Loading…
Reference in New Issue