Merge pull request 'orcid-no-doi' (#123) from enrico.ottonello/dnet-hadoop:orcid-no-doi into beta

Reviewed-on: D-Net/dnet-hadoop#123
This commit is contained in:
Claudio Atzori 2021-07-15 17:59:41 +02:00
parent 7e2caafe84
commit bf9e0d2d4f
2 changed files with 32 additions and 10 deletions

View File

@ -4,6 +4,7 @@ package eu.dnetlib.doiboost.orcidnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
@ -32,10 +33,7 @@ import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.*;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil; import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
@ -111,6 +109,10 @@ public class SparkGenEnrichedOrcidWorks {
Encoders.bean(WorkDetail.class)); Encoders.bean(WorkDetail.class));
logger.info("Works data loaded: " + workDataset.count()); logger.info("Works data loaded: " + workDataset.count());
final LongAccumulator warnNotFoundContributors = spark
.sparkContext()
.longAccumulator("warnNotFoundContributors");
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = workDataset JavaRDD<Tuple2<String, String>> enrichedWorksRDD = workDataset
.joinWith( .joinWith(
authorDataset, authorDataset,
@ -119,7 +121,21 @@ public class SparkGenEnrichedOrcidWorks {
(MapFunction<Tuple2<WorkDetail, AuthorData>, Tuple2<String, String>>) value -> { (MapFunction<Tuple2<WorkDetail, AuthorData>, Tuple2<String, String>>) value -> {
WorkDetail w = value._1; WorkDetail w = value._1;
AuthorData a = value._2; AuthorData a = value._2;
if (w.getContributors() == null
|| (w.getContributors() != null && w.getContributors().size() == 0)) {
Contributor c = new Contributor();
c.setName(a.getName());
c.setSurname(a.getSurname());
c.setCreditName(a.getCreditName());
c.setOid(a.getOid());
List<Contributor> contributors = Arrays.asList(c);
w.setContributors(contributors);
if (warnNotFoundContributors != null) {
warnNotFoundContributors.add(1);
}
} else {
AuthorMatcher.match(a, w.getContributors()); AuthorMatcher.match(a, w.getContributors());
}
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w)); return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
}, },
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
@ -172,7 +188,7 @@ public class SparkGenEnrichedOrcidWorks {
OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, p)))) OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, p))))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile( .saveAsNewAPIHadoopFile(
workingPath.concat(outputEnrichedWorksPath), outputEnrichedWorksPath,
Text.class, Text.class,
Text.class, Text.class,
SequenceFileOutputFormat.class, SequenceFileOutputFormat.class,
@ -180,6 +196,7 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("parsedPublications: " + parsedPublications.value().toString()); logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString()); logger.info("enrichedPublications: " + enrichedPublications.value().toString());
logger.info("warnNotFoundContributors: " + warnNotFoundContributors.value().toString());
logger.info("errorsGeneric: " + errorsGeneric.value().toString()); logger.info("errorsGeneric: " + errorsGeneric.value().toString());
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString()); logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());

View File

@ -7,9 +7,14 @@
</property> </property>
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<value>/data/orcid_activities_2020/no_doi_dataset_prod/</value>
<description>path where to store the action set</description> <description>path where to store the action set</description>
</property> </property>
<property>
<name>processOutputPath</name>
<value>/data/orcid_activities_2020/process_no_doi_dataset_prod</value>
<description>temporary path where to store the action set</description>
</property>
<property> <property>
<name>spark2GenNoDoiDatasetMaxExecutors</name> <name>spark2GenNoDoiDatasetMaxExecutors</name>
<value>40</value> <value>40</value>
@ -66,7 +71,7 @@
<action name="ResetWorkingPath"> <action name="ResetWorkingPath">
<fs> <fs>
<delete path='${workingPath}/no_doi_dataset'/> <delete path='${processOutputPath}'/>
</fs> </fs>
<ok to="GenOrcidNoDoiDataset"/> <ok to="GenOrcidNoDoiDataset"/>
<error to="Kill"/> <error to="Kill"/>
@ -92,7 +97,7 @@
<arg>--workingPath</arg><arg>${workingPath}/</arg> <arg>--workingPath</arg><arg>${workingPath}/</arg>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg> <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg> <arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg>
<arg>--outputEnrichedWorksPath</arg><arg>no_doi_dataset</arg> <arg>--outputEnrichedWorksPath</arg><arg>${processOutputPath}</arg>
</spark> </spark>
<ok to="importOrcidNoDoi"/> <ok to="importOrcidNoDoi"/>
<error to="Kill"/> <error to="Kill"/>
@ -100,7 +105,7 @@
<action name="importOrcidNoDoi"> <action name="importOrcidNoDoi">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${workingPath}/no_doi_dataset/*</arg> <arg>${processOutputPath}/*</arg>
<arg>${outputPath}</arg> <arg>${outputPath}</arg>
</distcp> </distcp>
<ok to="End"/> <ok to="End"/>