From 9e8e7fe6ef24dbf6a004190cf86cbc623c8b8d21 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 15 Sep 2020 11:32:49 +0200 Subject: [PATCH] add comments --- .../java/eu/dnetlib/doiboost/orcid/model/AuthorData.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java | 6 +++++- .../dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java | 6 +++++- .../doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java | 5 ++++- .../eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java | 4 ++++ .../eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java | 4 ++++ .../eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java | 4 ++++ .../doiboost/orcidnodoi/model/PublicationDate.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java | 4 ++++ .../dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 5 ++++- .../doiboost/orcidnodoi/similarity/AuthorMatcher.java | 8 ++++++++ .../doiboost/orcidnodoi/util/DumpToActionsUtility.java | 4 ++++ .../doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java | 4 ++++ 13 files changed, 58 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index 87f1f65c8d..e0624509b1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -3,6 +3,10 @@ package eu.dnetlib.doiboost.orcid.model; import java.io.Serializable; +/** + * This class models the data that are retrieved from orcid publication + */ + public class AuthorData implements Serializable { private String oid; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index 807f52972a..d852a7023e 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -20,10 +20,14 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; +/** + * This class writes one sequence file to hdfs; the key is an orcid identifier and the + * value is an orcid publication in json format + */ + public class ActivitiesDumpReader { private static final int MAX_XML_WORKS_PARSED = -1; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index 041424ba9a..d32e6d945a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -12,11 +12,15 @@ import org.mortbay.log.Log; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.doiboost.orcid.OrcidDSManager; +/** + * This job generates one sequence file; the key is an orcid identifier and the + * value is an orcid publication in json format + */ + public class GenOrcidAuthorWork extends OrcidDSManager { private String activitiesFileNameTarGz; private String outputWorksPath; -// private String workingPath; public static void main(String[] args) throws IOException, Exception { GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index dea597cbb9..b984ee2b29 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -22,7 +22,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.gson.Gson; -import com.google.gson.GsonBuilder; import com.google.gson.JsonElement; import com.google.gson.JsonParser; @@ -35,6 +34,10 @@ import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; +/** + * This spark job generates one parquet file containing the orcid publications dataset + */ + public class SparkGenEnrichedOrcidWorks { static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java index 7f7e3a10a8..363cb13e67 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java @@ -6,6 +6,10 @@ import com.google.gson.JsonObject; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; +/** + * This class converts an object to json and vice versa + */ + public class JsonWriter { public static String create(AuthorData authorData) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java index 8a170de09b..9a8651c853 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java @@ -5,6 +5,10 @@ import java.io.Serializable; import eu.dnetlib.doiboost.orcid.model.AuthorData; +/** + * This class models the data related to a contributor, that are retrieved from an orcid publication + */ + public class Contributor extends AuthorData implements Serializable { private String sequence; private String role; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java index 865e54ae37..7fe50ce25a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java @@ -1,6 +1,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; +/** + * This class models the data related to external id, that are retrieved from an orcid publication + */ + public class ExternalId { private String type; private String value; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java index 9282a80ba2..5f794d8eb6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java @@ -1,6 +1,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; +/** + * This class models the data related to a publication date, that are retrieved from an orcid publication + */ + public class PublicationDate { private String year; private String month; diff --git 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java index 5756521e7f..58f992d12a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java @@ -4,6 +4,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model; import java.io.Serializable; import java.util.List; +/** + * This class models the data that are retrieved from orcid publication + */ + public class WorkDataNoDoi implements Serializable { private String oid; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 503df67ff0..4d14084701 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -22,6 +22,10 @@ import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; import eu.dnetlib.doiboost.orcidnodoi.util.Pair; +/** + * This class converts an orcid publication from json format to oaf + */ + public class PublicationToOaf implements Serializable { static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); @@ -119,7 +123,6 @@ public class PublicationToOaf implements Serializable { public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { - logger.debug("generatePublicationActionsFromDump ..."); if (!isValid(rootElement)) { return null; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index 1e4c38bef5..88c84ee89a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -21,6 +21,14 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +/** + * This class searches a list of publication contributors for a + * specific author, making a similarity check on both name and surname of the + * author with the credit name of each contributor of the list; as soon as + * a match is found (if any), the author information is used to enrich the + * matched contributor inside the contributors list + */ + public class AuthorMatcher { private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java index 9b9f3c8b21..ea4e58c444 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -9,6 +9,10 @@ import org.apache.commons.lang3.StringUtils; import com.google.gson.JsonArray; import com.google.gson.JsonObject; +/** + * Utility class + */ + public class DumpToActionsUtility { private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index ae96a322f3..c5c1155515 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -17,6 +17,10 @@ import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +/** + * This class is used for parsing xml data with vtd parser + */ + public class XMLRecordParserNoDoi { private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);