forked from D-Net/dnet-hadoop
add comments
This commit is contained in:
parent
538f299767
commit
9e8e7fe6ef
|
@ -3,6 +3,10 @@ package eu.dnetlib.doiboost.orcid.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class models the data that are retrieved from orcid publication
|
||||||
|
*/
|
||||||
|
|
||||||
public class AuthorData implements Serializable {
|
public class AuthorData implements Serializable {
|
||||||
|
|
||||||
private String oid;
|
private String oid;
|
||||||
|
|
|
@ -20,10 +20,14 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class writes one sequence file to hdfs, the key is an orcid identifier and the
|
||||||
|
* value is an orcid publication in json format
|
||||||
|
*/
|
||||||
|
|
||||||
public class ActivitiesDumpReader {
|
public class ActivitiesDumpReader {
|
||||||
|
|
||||||
private static final int MAX_XML_WORKS_PARSED = -1;
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||||
|
|
|
@ -12,11 +12,15 @@ import org.mortbay.log.Log;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
|
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This job generates one sequence file, the key is an orcid identifier and the
|
||||||
|
* value is an orcid publication in json format
|
||||||
|
*/
|
||||||
|
|
||||||
public class GenOrcidAuthorWork extends OrcidDSManager {
|
public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||||
|
|
||||||
private String activitiesFileNameTarGz;
|
private String activitiesFileNameTarGz;
|
||||||
private String outputWorksPath;
|
private String outputWorksPath;
|
||||||
// private String workingPath;
|
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException, Exception {
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||||
|
|
|
@ -22,7 +22,6 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
|
||||||
import com.google.gson.JsonElement;
|
import com.google.gson.JsonElement;
|
||||||
import com.google.gson.JsonParser;
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
|
@ -35,6 +34,10 @@ import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This spark job generates one parquet file, containing orcid publications dataset
|
||||||
|
*/
|
||||||
|
|
||||||
public class SparkGenEnrichedOrcidWorks {
|
public class SparkGenEnrichedOrcidWorks {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
|
static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
|
||||||
|
|
|
@ -6,6 +6,10 @@ import com.google.gson.JsonObject;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class converts an object to json and vice versa
|
||||||
|
*/
|
||||||
|
|
||||||
public class JsonWriter {
|
public class JsonWriter {
|
||||||
|
|
||||||
public static String create(AuthorData authorData) {
|
public static String create(AuthorData authorData) {
|
||||||
|
|
|
@ -5,6 +5,10 @@ import java.io.Serializable;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class models the data related to a contributor, that are retrieved from an orcid publication
|
||||||
|
*/
|
||||||
|
|
||||||
public class Contributor extends AuthorData implements Serializable {
|
public class Contributor extends AuthorData implements Serializable {
|
||||||
private String sequence;
|
private String sequence;
|
||||||
private String role;
|
private String role;
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class models the data related to external id, that are retrieved from an orcid publication
|
||||||
|
*/
|
||||||
|
|
||||||
public class ExternalId {
|
public class ExternalId {
|
||||||
private String type;
|
private String type;
|
||||||
private String value;
|
private String value;
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class models the data related to a publication date, that are retrieved from an orcid publication
|
||||||
|
*/
|
||||||
|
|
||||||
public class PublicationDate {
|
public class PublicationDate {
|
||||||
private String year;
|
private String year;
|
||||||
private String month;
|
private String month;
|
||||||
|
|
|
@ -4,6 +4,10 @@ package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class models the data that are retrieved from orcid publication
|
||||||
|
*/
|
||||||
|
|
||||||
public class WorkDataNoDoi implements Serializable {
|
public class WorkDataNoDoi implements Serializable {
|
||||||
|
|
||||||
private String oid;
|
private String oid;
|
||||||
|
|
|
@ -22,6 +22,10 @@ import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class converts an orcid publication from json format to oaf
|
||||||
|
*/
|
||||||
|
|
||||||
public class PublicationToOaf implements Serializable {
|
public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||||
|
@ -119,7 +123,6 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
|
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
|
||||||
|
|
||||||
logger.debug("generatePublicationActionsFromDump ...");
|
|
||||||
if (!isValid(rootElement)) {
|
if (!isValid(rootElement)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,14 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is used for searching from a list of publication contributors a
|
||||||
|
* specific author making a similarity check on both name and surname of the
|
||||||
|
* author with the credit name of each contributor of the list; as soon as
|
||||||
|
* the match is found (if any), the author information is used to enrich the
|
||||||
|
* matched contributor inside the contributors list
|
||||||
|
*/
|
||||||
|
|
||||||
public class AuthorMatcher {
|
public class AuthorMatcher {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||||
|
|
|
@ -9,6 +9,10 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.google.gson.JsonArray;
|
import com.google.gson.JsonArray;
|
||||||
import com.google.gson.JsonObject;
|
import com.google.gson.JsonObject;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility class
|
||||||
|
*/
|
||||||
|
|
||||||
public class DumpToActionsUtility {
|
public class DumpToActionsUtility {
|
||||||
|
|
||||||
private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
|
private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
|
||||||
|
|
|
@ -17,6 +17,10 @@ import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
|
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is used for parsing xml data with vtd parser
|
||||||
|
*/
|
||||||
|
|
||||||
public class XMLRecordParserNoDoi {
|
public class XMLRecordParserNoDoi {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
|
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
|
||||||
|
|
Loading…
Reference in New Issue