This commit is contained in:
Enrico Ottonello 2021-03-31 16:25:41 +02:00
parent 91d8660982
commit 59ec5137e1
2 changed files with 55 additions and 12 deletions

View File

@ -138,6 +138,11 @@ public class SparkGenEnrichedOrcidWorks {
.longAccumulator("errorsNotFoundAuthors"); .longAccumulator("errorsNotFoundAuthors");
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType"); final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound"); final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found");
final LongAccumulator titleNotProvidedAcc = spark
.sparkContext()
.longAccumulator("Title_not_provided_found");
final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found");
final PublicationToOaf publicationToOaf = new PublicationToOaf( final PublicationToOaf publicationToOaf = new PublicationToOaf(
parsedPublications, parsedPublications,
@ -147,6 +152,9 @@ public class SparkGenEnrichedOrcidWorks {
errorsNotFoundAuthors, errorsNotFoundAuthors,
errorsInvalidType, errorsInvalidType,
otherTypeFound, otherTypeFound,
deactivatedAcc,
titleNotProvidedAcc,
noUrlAcc,
dateOfCollection); dateOfCollection);
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
.map( .map(
@ -177,6 +185,9 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString()); logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
logger.info("otherTypeFound: " + otherTypeFound.value().toString()); logger.info("otherTypeFound: " + otherTypeFound.value().toString());
logger.info("deactivatedAcc: " + deactivatedAcc.value().toString());
logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString());
logger.info("noUrlAcc: " + noUrlAcc.value().toString());
}); });
} }
} }

View File

@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = "ORCID";
public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
public final static String orcidPREFIX = "orcid_______"; public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____"; public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::"; public static final String SEPARATOR = "::";
public static final String DEACTIVATED_NAME = "Given Names Deactivated";
public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
private String dateOfCollection = ""; private String dateOfCollection = "";
private final LongAccumulator parsedPublications; private final LongAccumulator parsedPublications;
@ -44,6 +44,9 @@ public class PublicationToOaf implements Serializable {
private final LongAccumulator errorsNotFoundAuthors; private final LongAccumulator errorsNotFoundAuthors;
private final LongAccumulator errorsInvalidType; private final LongAccumulator errorsInvalidType;
private final LongAccumulator otherTypeFound; private final LongAccumulator otherTypeFound;
private final LongAccumulator deactivatedAcc;
private final LongAccumulator titleNotProvidedAcc;
private final LongAccumulator noUrlAcc;
public PublicationToOaf( public PublicationToOaf(
LongAccumulator parsedPublications, LongAccumulator parsedPublications,
@ -53,6 +56,9 @@ public class PublicationToOaf implements Serializable {
LongAccumulator errorsNotFoundAuthors, LongAccumulator errorsNotFoundAuthors,
LongAccumulator errorsInvalidType, LongAccumulator errorsInvalidType,
LongAccumulator otherTypeFound, LongAccumulator otherTypeFound,
LongAccumulator deactivatedAcc,
LongAccumulator titleNotProvidedAcc,
LongAccumulator noUrlAcc,
String dateOfCollection) { String dateOfCollection) {
this.parsedPublications = parsedPublications; this.parsedPublications = parsedPublications;
this.enrichedPublications = enrichedPublications; this.enrichedPublications = enrichedPublications;
@ -61,6 +67,9 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = errorsNotFoundAuthors; this.errorsNotFoundAuthors = errorsNotFoundAuthors;
this.errorsInvalidType = errorsInvalidType; this.errorsInvalidType = errorsInvalidType;
this.otherTypeFound = otherTypeFound; this.otherTypeFound = otherTypeFound;
this.deactivatedAcc = deactivatedAcc;
this.titleNotProvidedAcc = titleNotProvidedAcc;
this.noUrlAcc = noUrlAcc;
this.dateOfCollection = dateOfCollection; this.dateOfCollection = dateOfCollection;
} }
@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = null; this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null; this.errorsInvalidType = null;
this.otherTypeFound = null; this.otherTypeFound = null;
this.deactivatedAcc = null;
this.titleNotProvidedAcc = null;
this.noUrlAcc = null;
this.dateOfCollection = null; this.dateOfCollection = null;
} }
/*
 * Static lookup table holding a single ORCID entry, built with an anonymous
 * HashMap subclass and an instance-initializer block.
 *
 * Before this commit: key is ORCID.toLowerCase(); value is
 * Pair(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid").
 * After this commit: key is ModelConstants.ORCID; value is
 * Pair(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid").
 *
 * Presumably ModelConstants.ORCID is the lower-case string "orcid", which would
 * keep the key and value identical to the old ones - TODO confirm.
 *
 * NOTE(review): toUpperCase() without an explicit Locale uses the JVM default
 * locale; under a Turkish locale 'i' upper-cases to a dotted capital I, so the
 * value would no longer be "ORCID". Consider toUpperCase(Locale.ROOT).
 */
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() { private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{ {
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); put(
ModelConstants.ORCID,
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
} }
}; };
@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
} }
return null; return null;
} }
if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
if (titleNotProvidedAcc != null) {
titleNotProvidedAcc.add(1);
}
return null;
}
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
publication publication
.setTitle( .setTitle(
@ -244,8 +264,13 @@ public class PublicationToOaf implements Serializable {
if (urls != null && !urls.isEmpty()) { if (urls != null && !urls.isEmpty()) {
instance.setUrl(urls); instance.setUrl(urls);
} else { } else {
dataInfo.setInvisible(true); if (noUrlAcc != null) {
noUrlAcc.add(1);
} }
return null;
}
dataInfo.setInvisible(true);
final String pubDate = getPublicationDate(rootElement, "publicationDates"); final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) { if (StringUtils.isNotBlank(pubDate)) {
@ -273,7 +298,17 @@ public class PublicationToOaf implements Serializable {
// Adding authors // Adding authors
final List<Author> authors = createAuthors(rootElement); final List<Author> authors = createAuthors(rootElement);
if (authors != null && authors.size() > 0) { if (authors != null && authors.size() > 0) {
if (authors.stream().filter(a -> {
return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) ||
(Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME)));
}).count() > 0) {
if (deactivatedAcc != null) {
deactivatedAcc.add(1);
}
return null;
} else {
publication.setAuthor(authors); publication.setAuthor(authors);
}
} else { } else {
if (authors == null) { if (authors == null) {
Gson gson = new GsonBuilder().setPrettyPrinting().create(); Gson gson = new GsonBuilder().setPrettyPrinting().create();
@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
/*
 * Builds a new KeyValue describing where the record was collected from.
 *
 * The key is the literal "10|" followed by OPENAIRE_PREFIX, SEPARATOR and the
 * fixed identifier "806360c771262b4d6770e7cdf04b5c5a"; it is unchanged by this
 * commit.
 * The value was the local ORCID constant before this commit and is
 * ModelConstants.ORCID.toUpperCase() after it.
 *
 * NOTE(review): toUpperCase() without an explicit Locale depends on the JVM
 * default locale (a Turkish locale maps 'i' to a dotted capital I); consider
 * toUpperCase(Locale.ROOT) so the value is always the same string.
 *
 * Returns a freshly allocated KeyValue on every call.
 */
private KeyValue createCollectedFrom() { private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue(); KeyValue cf = new KeyValue();
cf.setValue(ORCID); cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf; return cf;
} }
/*
 * Returns the KeyValue used as the "hosted by" datasource.
 *
 * Before this commit: allocated a new KeyValue per call with value
 * "Unknown Repository" and key "10|" + OPENAIRE_PREFIX + SEPARATOR +
 * "55045bd2a65019fd8e6741a755395c8c".
 * After this commit: returns ModelConstants.UNKNOWN_REPOSITORY directly.
 *
 * NOTE(review): the new version hands out the same object on every call instead
 * of a fresh instance. If KeyValue is mutable and any caller modifies the
 * result, that change would be visible to every other user of the constant -
 * verify against callers. Presumably the constant carries the same key and
 * value as the removed literals - TODO confirm.
 */
private KeyValue createHostedBy() { private KeyValue createHostedBy() {
KeyValue hb = new KeyValue(); return ModelConstants.UNKNOWN_REPOSITORY;
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
} }
private StructuredProperty mapAuthorId(String orcidId) { private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty(); final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId); sp.setValue(orcidId);
final Qualifier q = new Qualifier(); final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase()); q.setClassid(ModelConstants.ORCID);
q.setClassname(ORCID_PID_TYPE_CLASSNAME); q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES); q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES); q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q); sp.setQualifier(q);