From 59ec5137e17e470868eaa46e51a51cb219b7fa50 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 31 Mar 2021 16:25:41 +0200 Subject: [PATCH] improvement related to https://issue.openaire.research-infrastructures.eu/issues/6501 --- .../SparkGenEnrichedOrcidWorks.java | 11 ++++ .../orcidnodoi/oaf/PublicationToOaf.java | 56 +++++++++++++++---- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index cda08939c..5bcec7224 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -138,6 +138,11 @@ public class SparkGenEnrichedOrcidWorks { .longAccumulator("errorsNotFoundAuthors"); final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType"); final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound"); + final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found"); + final LongAccumulator titleNotProvidedAcc = spark + .sparkContext() + .longAccumulator("Title_not_provided_found"); + final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found"); final PublicationToOaf publicationToOaf = new PublicationToOaf( parsedPublications, @@ -147,6 +152,9 @@ public class SparkGenEnrichedOrcidWorks { errorsNotFoundAuthors, errorsInvalidType, otherTypeFound, + deactivatedAcc, + titleNotProvidedAcc, + noUrlAcc, dateOfCollection); JavaRDD oafPublicationRDD = enrichedWorksRDD .map( @@ -177,6 +185,9 @@ public class SparkGenEnrichedOrcidWorks { logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); logger.info("errorsInvalidType: " + errorsInvalidType.value().toString()); logger.info("otherTypeFound: " + otherTypeFound.value().toString()); + logger.info("deactivatedAcc: " + deactivatedAcc.value().toString()); + logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString()); + logger.info("noUrlAcc: " + noUrlAcc.value().toString()); }); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 777f3fa46..5c3236222 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable { static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); - public static final String ORCID = "ORCID"; - public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID"; public final static String orcidPREFIX = "orcid_______"; public static final String OPENAIRE_PREFIX = "openaire____"; public static final String SEPARATOR = "::"; + public static final String DEACTIVATED_NAME = "Given Names Deactivated"; + public static final String DEACTIVATED_SURNAME = "Family Name Deactivated"; private String dateOfCollection = ""; private final LongAccumulator parsedPublications; @@ -44,6 +44,9 @@ public class PublicationToOaf implements Serializable { private final LongAccumulator errorsNotFoundAuthors; private final LongAccumulator errorsInvalidType; private final LongAccumulator otherTypeFound; + private final LongAccumulator deactivatedAcc; + private final LongAccumulator titleNotProvidedAcc; + private final LongAccumulator noUrlAcc; public PublicationToOaf( LongAccumulator parsedPublications, @@ -53,6 +56,9 @@ public class PublicationToOaf implements Serializable { LongAccumulator errorsNotFoundAuthors, LongAccumulator errorsInvalidType, LongAccumulator otherTypeFound, + LongAccumulator deactivatedAcc, + LongAccumulator titleNotProvidedAcc, + LongAccumulator noUrlAcc, String dateOfCollection) { this.parsedPublications = parsedPublications; this.enrichedPublications = enrichedPublications; @@ -61,6 +67,9 @@ public class PublicationToOaf implements Serializable { this.errorsNotFoundAuthors = errorsNotFoundAuthors; this.errorsInvalidType = errorsInvalidType; this.otherTypeFound = otherTypeFound; + this.deactivatedAcc = deactivatedAcc; + this.titleNotProvidedAcc = titleNotProvidedAcc; + this.noUrlAcc = noUrlAcc; this.dateOfCollection = dateOfCollection; } @@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable { this.errorsNotFoundAuthors = null; this.errorsInvalidType = null; this.otherTypeFound = null; + this.deactivatedAcc = null; + this.titleNotProvidedAcc = null; + this.noUrlAcc = null; this.dateOfCollection = null; } private static Map> datasources = new HashMap>() { { - put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); + put( + ModelConstants.ORCID, + new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid")); } }; @@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable { } return null; } + if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) { + if (titleNotProvidedAcc != null) { + titleNotProvidedAcc.add(1); + } + return null; + } Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); publication .setTitle( @@ -244,9 +264,14 @@ public class PublicationToOaf implements Serializable { if (urls != null && !urls.isEmpty()) { instance.setUrl(urls); } else { - dataInfo.setInvisible(true); + if (noUrlAcc != null) { + noUrlAcc.add(1); + } + return null; } + dataInfo.setInvisible(true); + final String pubDate = getPublicationDate(rootElement, "publicationDates"); if (StringUtils.isNotBlank(pubDate)) { instance.setDateofacceptance(mapStringField(pubDate, null)); @@ -273,7 +298,17 @@ public class PublicationToOaf implements Serializable { // Adding authors final List authors = createAuthors(rootElement); if (authors != null && authors.size() > 0) { - publication.setAuthor(authors); + if (authors.stream().filter(a -> { + return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) || + (Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME))); + }).count() > 0) { + if (deactivatedAcc != null) { + deactivatedAcc.add(1); + } + return null; + } else { + publication.setAuthor(authors); + } } else { if (authors == null) { Gson gson = new GsonBuilder().setPrettyPrinting().create(); @@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable { private KeyValue createCollectedFrom() { KeyValue cf = new KeyValue(); - cf.setValue(ORCID); + cf.setValue(ModelConstants.ORCID.toUpperCase()); cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); return cf; } private KeyValue createHostedBy() { - KeyValue hb = new KeyValue(); - hb.setValue("Unknown Repository"); - hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); - return hb; + return ModelConstants.UNKNOWN_REPOSITORY; } private StructuredProperty mapAuthorId(String orcidId) { final StructuredProperty sp = new StructuredProperty(); sp.setValue(orcidId); final Qualifier q = new Qualifier(); - q.setClassid(ORCID.toLowerCase()); - q.setClassname(ORCID_PID_TYPE_CLASSNAME); + q.setClassid(ModelConstants.ORCID); + q.setClassname(ModelConstants.ORCID_CLASSNAME); q.setSchemeid(ModelConstants.DNET_PID_TYPES); q.setSchemename(ModelConstants.DNET_PID_TYPES); sp.setQualifier(q);