orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
13 changed files with 58 additions and 4 deletions
Showing only changes of commit 9e8e7fe6ef - Show all commits

View File

@ -3,6 +3,10 @@ package eu.dnetlib.doiboost.orcid.model;
import java.io.Serializable; import java.io.Serializable;
/**
* This class models the data that are retrieved from an orcid publication
*/
public class AuthorData implements Serializable { public class AuthorData implements Serializable {
private String oid; private String oid;

View File

@ -20,10 +20,14 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
/**
* This class writes one sequence file to hdfs; the key is an orcid identifier and the
* value is an orcid publication in json format
*/
public class ActivitiesDumpReader { public class ActivitiesDumpReader {
private static final int MAX_XML_WORKS_PARSED = -1; private static final int MAX_XML_WORKS_PARSED = -1;

View File

@ -12,11 +12,15 @@ import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.OrcidDSManager; import eu.dnetlib.doiboost.orcid.OrcidDSManager;
/**
* This job generates one sequence file; the key is an orcid identifier and the
* value is an orcid publication in json format
*/
public class GenOrcidAuthorWork extends OrcidDSManager { public class GenOrcidAuthorWork extends OrcidDSManager {
private String activitiesFileNameTarGz; private String activitiesFileNameTarGz;
private String outputWorksPath; private String outputWorksPath;
// private String workingPath;
public static void main(String[] args) throws IOException, Exception { public static void main(String[] args) throws IOException, Exception {
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();

View File

@ -22,7 +22,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement; import com.google.gson.JsonElement;
import com.google.gson.JsonParser; import com.google.gson.JsonParser;
@ -35,6 +34,10 @@ import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2; import scala.Tuple2;
/**
* This spark job generates one parquet file containing the orcid publications dataset
*/
public class SparkGenEnrichedOrcidWorks { public class SparkGenEnrichedOrcidWorks {
static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);

View File

@ -6,6 +6,10 @@ import com.google.gson.JsonObject;
import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.model.WorkData;
/**
* This class converts an object to json and vice versa
*/
public class JsonWriter { public class JsonWriter {
public static String create(AuthorData authorData) { public static String create(AuthorData authorData) {

View File

@ -5,6 +5,10 @@ import java.io.Serializable;
import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.AuthorData;
/**
* This class models the data related to a contributor, that are retrieved from an orcid publication
*/
public class Contributor extends AuthorData implements Serializable { public class Contributor extends AuthorData implements Serializable {
private String sequence; private String sequence;
private String role; private String role;

View File

@ -1,6 +1,10 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.doiboost.orcidnodoi.model;
/**
* This class models the data related to an external id, that are retrieved from an orcid publication
*/
public class ExternalId { public class ExternalId {
private String type; private String type;
private String value; private String value;

View File

@ -1,6 +1,10 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.doiboost.orcidnodoi.model;
/**
* This class models the data related to a publication date, that are retrieved from an orcid publication
*/
public class PublicationDate { public class PublicationDate {
private String year; private String year;
private String month; private String month;

View File

@ -4,6 +4,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
/**
* This class models the data that are retrieved from an orcid publication
*/
public class WorkDataNoDoi implements Serializable { public class WorkDataNoDoi implements Serializable {
private String oid; private String oid;

View File

@ -22,6 +22,10 @@ import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
import eu.dnetlib.doiboost.orcidnodoi.util.Pair; import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
/**
* This class converts an orcid publication from json format to oaf
*/
public class PublicationToOaf implements Serializable { public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
@ -119,7 +123,6 @@ public class PublicationToOaf implements Serializable {
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
logger.debug("generatePublicationActionsFromDump ...");
if (!isValid(rootElement)) { if (!isValid(rootElement)) {
return null; return null;
Review

Is the caller expecting the null? Otherwise this would likely produce a NPE.

Is the caller expecting the `null`? Otherwise this would likely produce a NPE.
Review

yes, there is a filter for null value:
JavaRDD oafPublicationRDD = enrichedWorksRDD .map( e -> { return (Publication) publicationToOaf .generatePublicationActionsFromJson(e._2()); }) .filter(p -> p != null);

yes, there is a filter for null value: JavaRDD oafPublicationRDD = enrichedWorksRDD .map( e -> { return (Publication) publicationToOaf .generatePublicationActionsFromJson(e._2()); }) .filter(p -> p != null);
} }

View File

@ -21,6 +21,14 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
/**
* This class searches a list of publication contributors for a specific
* author by checking the similarity of the author's name and surname against
* the credit name of each contributor in the list; as soon as a match is
* found (if one exists), the author's information is used to enrich the
* matched contributor inside the contributors list
*/
public class AuthorMatcher { public class AuthorMatcher {
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);

View File

@ -9,6 +9,10 @@ import org.apache.commons.lang3.StringUtils;
import com.google.gson.JsonArray; import com.google.gson.JsonArray;
import com.google.gson.JsonObject; import com.google.gson.JsonObject;
/**
* Utility class
*/
public class DumpToActionsUtility { public class DumpToActionsUtility {
private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);

View File

@ -17,6 +17,10 @@ import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
/**
* This class is used for parsing xml data with vtd parser
*/
public class XMLRecordParserNoDoi { public class XMLRecordParserNoDoi {
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class); private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);