This commit is contained in:
Enrico Ottonello 2021-03-31 16:25:41 +02:00
parent 91d8660982
commit 59ec5137e1
2 changed files with 55 additions and 12 deletions

View File

@ -138,6 +138,11 @@ public class SparkGenEnrichedOrcidWorks {
.longAccumulator("errorsNotFoundAuthors");
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found");
final LongAccumulator titleNotProvidedAcc = spark
.sparkContext()
.longAccumulator("Title_not_provided_found");
final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found");
final PublicationToOaf publicationToOaf = new PublicationToOaf(
parsedPublications,
@ -147,6 +152,9 @@ public class SparkGenEnrichedOrcidWorks {
errorsNotFoundAuthors,
errorsInvalidType,
otherTypeFound,
deactivatedAcc,
titleNotProvidedAcc,
noUrlAcc,
dateOfCollection);
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
.map(
@ -177,6 +185,9 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
logger.info("deactivatedAcc: " + deactivatedAcc.value().toString());
logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString());
logger.info("noUrlAcc: " + noUrlAcc.value().toString());
});
}
}

View File

@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = "ORCID";
public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
public static final String DEACTIVATED_NAME = "Given Names Deactivated";
public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
private String dateOfCollection = "";
private final LongAccumulator parsedPublications;
@ -44,6 +44,9 @@ public class PublicationToOaf implements Serializable {
private final LongAccumulator errorsNotFoundAuthors;
private final LongAccumulator errorsInvalidType;
private final LongAccumulator otherTypeFound;
private final LongAccumulator deactivatedAcc;
private final LongAccumulator titleNotProvidedAcc;
private final LongAccumulator noUrlAcc;
public PublicationToOaf(
LongAccumulator parsedPublications,
@ -53,6 +56,9 @@ public class PublicationToOaf implements Serializable {
LongAccumulator errorsNotFoundAuthors,
LongAccumulator errorsInvalidType,
LongAccumulator otherTypeFound,
LongAccumulator deactivatedAcc,
LongAccumulator titleNotProvidedAcc,
LongAccumulator noUrlAcc,
String dateOfCollection) {
this.parsedPublications = parsedPublications;
this.enrichedPublications = enrichedPublications;
@ -61,6 +67,9 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
this.errorsInvalidType = errorsInvalidType;
this.otherTypeFound = otherTypeFound;
this.deactivatedAcc = deactivatedAcc;
this.titleNotProvidedAcc = titleNotProvidedAcc;
this.noUrlAcc = noUrlAcc;
this.dateOfCollection = dateOfCollection;
}
@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null;
this.otherTypeFound = null;
this.deactivatedAcc = null;
this.titleNotProvidedAcc = null;
this.noUrlAcc = null;
this.dateOfCollection = null;
}
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
put(
ModelConstants.ORCID,
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
}
};
@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
}
return null;
}
if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
if (titleNotProvidedAcc != null) {
titleNotProvidedAcc.add(1);
}
return null;
}
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
publication
.setTitle(
@ -244,9 +264,14 @@ public class PublicationToOaf implements Serializable {
if (urls != null && !urls.isEmpty()) {
instance.setUrl(urls);
} else {
dataInfo.setInvisible(true);
if (noUrlAcc != null) {
noUrlAcc.add(1);
}
return null;
}
dataInfo.setInvisible(true);
final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) {
instance.setDateofacceptance(mapStringField(pubDate, null));
@ -273,7 +298,17 @@ public class PublicationToOaf implements Serializable {
// Adding authors
final List<Author> authors = createAuthors(rootElement);
if (authors != null && authors.size() > 0) {
publication.setAuthor(authors);
if (authors.stream().filter(a -> {
return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) ||
(Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME)));
}).count() > 0) {
if (deactivatedAcc != null) {
deactivatedAcc.add(1);
}
return null;
} else {
publication.setAuthor(authors);
}
} else {
if (authors == null) {
Gson gson = new GsonBuilder().setPrettyPrinting().create();
@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
cf.setValue(ORCID);
cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
KeyValue hb = new KeyValue();
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
return ModelConstants.UNKNOWN_REPOSITORY;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase());
q.setClassname(ORCID_PID_TYPE_CLASSNAME);
q.setClassid(ModelConstants.ORCID);
q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);