forked from D-Net/dnet-hadoop
improvement related to https://issue.openaire.research-infrastructures.eu/issues/6501
This commit is contained in:
parent
91d8660982
commit
59ec5137e1
|
@ -138,6 +138,11 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
.longAccumulator("errorsNotFoundAuthors");
|
.longAccumulator("errorsNotFoundAuthors");
|
||||||
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
|
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
|
||||||
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
|
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
|
||||||
|
final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found");
|
||||||
|
final LongAccumulator titleNotProvidedAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("Title_not_provided_found");
|
||||||
|
final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found");
|
||||||
|
|
||||||
final PublicationToOaf publicationToOaf = new PublicationToOaf(
|
final PublicationToOaf publicationToOaf = new PublicationToOaf(
|
||||||
parsedPublications,
|
parsedPublications,
|
||||||
|
@ -147,6 +152,9 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
errorsNotFoundAuthors,
|
errorsNotFoundAuthors,
|
||||||
errorsInvalidType,
|
errorsInvalidType,
|
||||||
otherTypeFound,
|
otherTypeFound,
|
||||||
|
deactivatedAcc,
|
||||||
|
titleNotProvidedAcc,
|
||||||
|
noUrlAcc,
|
||||||
dateOfCollection);
|
dateOfCollection);
|
||||||
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
|
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
|
||||||
.map(
|
.map(
|
||||||
|
@ -177,6 +185,9 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
|
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
|
||||||
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
|
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
|
||||||
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
|
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
|
||||||
|
logger.info("deactivatedAcc: " + deactivatedAcc.value().toString());
|
||||||
|
logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString());
|
||||||
|
logger.info("noUrlAcc: " + noUrlAcc.value().toString());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||||
|
|
||||||
public static final String ORCID = "ORCID";
|
|
||||||
public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
|
|
||||||
public final static String orcidPREFIX = "orcid_______";
|
public final static String orcidPREFIX = "orcid_______";
|
||||||
public static final String OPENAIRE_PREFIX = "openaire____";
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||||
public static final String SEPARATOR = "::";
|
public static final String SEPARATOR = "::";
|
||||||
|
public static final String DEACTIVATED_NAME = "Given Names Deactivated";
|
||||||
|
public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
|
||||||
|
|
||||||
private String dateOfCollection = "";
|
private String dateOfCollection = "";
|
||||||
private final LongAccumulator parsedPublications;
|
private final LongAccumulator parsedPublications;
|
||||||
|
@ -44,6 +44,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
private final LongAccumulator errorsNotFoundAuthors;
|
private final LongAccumulator errorsNotFoundAuthors;
|
||||||
private final LongAccumulator errorsInvalidType;
|
private final LongAccumulator errorsInvalidType;
|
||||||
private final LongAccumulator otherTypeFound;
|
private final LongAccumulator otherTypeFound;
|
||||||
|
private final LongAccumulator deactivatedAcc;
|
||||||
|
private final LongAccumulator titleNotProvidedAcc;
|
||||||
|
private final LongAccumulator noUrlAcc;
|
||||||
|
|
||||||
public PublicationToOaf(
|
public PublicationToOaf(
|
||||||
LongAccumulator parsedPublications,
|
LongAccumulator parsedPublications,
|
||||||
|
@ -53,6 +56,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
LongAccumulator errorsNotFoundAuthors,
|
LongAccumulator errorsNotFoundAuthors,
|
||||||
LongAccumulator errorsInvalidType,
|
LongAccumulator errorsInvalidType,
|
||||||
LongAccumulator otherTypeFound,
|
LongAccumulator otherTypeFound,
|
||||||
|
LongAccumulator deactivatedAcc,
|
||||||
|
LongAccumulator titleNotProvidedAcc,
|
||||||
|
LongAccumulator noUrlAcc,
|
||||||
String dateOfCollection) {
|
String dateOfCollection) {
|
||||||
this.parsedPublications = parsedPublications;
|
this.parsedPublications = parsedPublications;
|
||||||
this.enrichedPublications = enrichedPublications;
|
this.enrichedPublications = enrichedPublications;
|
||||||
|
@ -61,6 +67,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
||||||
this.errorsInvalidType = errorsInvalidType;
|
this.errorsInvalidType = errorsInvalidType;
|
||||||
this.otherTypeFound = otherTypeFound;
|
this.otherTypeFound = otherTypeFound;
|
||||||
|
this.deactivatedAcc = deactivatedAcc;
|
||||||
|
this.titleNotProvidedAcc = titleNotProvidedAcc;
|
||||||
|
this.noUrlAcc = noUrlAcc;
|
||||||
this.dateOfCollection = dateOfCollection;
|
this.dateOfCollection = dateOfCollection;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
|
||||||
this.errorsNotFoundAuthors = null;
|
this.errorsNotFoundAuthors = null;
|
||||||
this.errorsInvalidType = null;
|
this.errorsInvalidType = null;
|
||||||
this.otherTypeFound = null;
|
this.otherTypeFound = null;
|
||||||
|
this.deactivatedAcc = null;
|
||||||
|
this.titleNotProvidedAcc = null;
|
||||||
|
this.noUrlAcc = null;
|
||||||
this.dateOfCollection = null;
|
this.dateOfCollection = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
||||||
|
|
||||||
{
|
{
|
||||||
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
put(
|
||||||
|
ModelConstants.ORCID,
|
||||||
|
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
|
||||||
|
if (titleNotProvidedAcc != null) {
|
||||||
|
titleNotProvidedAcc.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||||
publication
|
publication
|
||||||
.setTitle(
|
.setTitle(
|
||||||
|
@ -244,8 +264,13 @@ public class PublicationToOaf implements Serializable {
|
||||||
if (urls != null && !urls.isEmpty()) {
|
if (urls != null && !urls.isEmpty()) {
|
||||||
instance.setUrl(urls);
|
instance.setUrl(urls);
|
||||||
} else {
|
} else {
|
||||||
dataInfo.setInvisible(true);
|
if (noUrlAcc != null) {
|
||||||
|
noUrlAcc.add(1);
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
dataInfo.setInvisible(true);
|
||||||
|
|
||||||
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
||||||
if (StringUtils.isNotBlank(pubDate)) {
|
if (StringUtils.isNotBlank(pubDate)) {
|
||||||
|
@ -273,7 +298,17 @@ public class PublicationToOaf implements Serializable {
|
||||||
// Adding authors
|
// Adding authors
|
||||||
final List<Author> authors = createAuthors(rootElement);
|
final List<Author> authors = createAuthors(rootElement);
|
||||||
if (authors != null && authors.size() > 0) {
|
if (authors != null && authors.size() > 0) {
|
||||||
|
if (authors.stream().filter(a -> {
|
||||||
|
return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) ||
|
||||||
|
(Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME)));
|
||||||
|
}).count() > 0) {
|
||||||
|
if (deactivatedAcc != null) {
|
||||||
|
deactivatedAcc.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
publication.setAuthor(authors);
|
publication.setAuthor(authors);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
if (authors == null) {
|
if (authors == null) {
|
||||||
Gson gson = new GsonBuilder().setPrettyPrinting().create();
|
Gson gson = new GsonBuilder().setPrettyPrinting().create();
|
||||||
|
@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
private KeyValue createCollectedFrom() {
|
private KeyValue createCollectedFrom() {
|
||||||
KeyValue cf = new KeyValue();
|
KeyValue cf = new KeyValue();
|
||||||
cf.setValue(ORCID);
|
cf.setValue(ModelConstants.ORCID.toUpperCase());
|
||||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||||
return cf;
|
return cf;
|
||||||
}
|
}
|
||||||
|
|
||||||
private KeyValue createHostedBy() {
|
private KeyValue createHostedBy() {
|
||||||
KeyValue hb = new KeyValue();
|
return ModelConstants.UNKNOWN_REPOSITORY;
|
||||||
hb.setValue("Unknown Repository");
|
|
||||||
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
|
|
||||||
return hb;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private StructuredProperty mapAuthorId(String orcidId) {
|
private StructuredProperty mapAuthorId(String orcidId) {
|
||||||
final StructuredProperty sp = new StructuredProperty();
|
final StructuredProperty sp = new StructuredProperty();
|
||||||
sp.setValue(orcidId);
|
sp.setValue(orcidId);
|
||||||
final Qualifier q = new Qualifier();
|
final Qualifier q = new Qualifier();
|
||||||
q.setClassid(ORCID.toLowerCase());
|
q.setClassid(ModelConstants.ORCID);
|
||||||
q.setClassname(ORCID_PID_TYPE_CLASSNAME);
|
q.setClassname(ModelConstants.ORCID_CLASSNAME);
|
||||||
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
sp.setQualifier(q);
|
sp.setQualifier(q);
|
||||||
|
|
Loading…
Reference in New Issue