From 5525f57ec8f9ef07d74ab30c54ab8d39e924d413 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 1 Jul 2020 18:36:14 +0200 Subject: [PATCH] converter from orcid work json to oaf --- .../orcidnodoi/oaf/OrcidWorkToOAF.java | 420 +++++++++++++++++ .../orcidnodoi/proto/ProtoWriter.java | 427 ------------------ .../orcidnodoi/util/DumpToActionsUtility.java | 107 +++++ .../doiboost/orcidnodoi/util/Pair.java | 30 ++ .../orcidnodoi/mappings/typologies.json | 41 ++ 5 files changed, 598 insertions(+), 427 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java delete mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java new file mode 100644 index 000000000..673abb407 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/OrcidWorkToOAF.java @@ -0,0 +1,420 @@ + +package eu.dnetlib.doiboost.orcidnodoi.oaf; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import eu.dnetlib.dhp.common.PacePerson; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks; +import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; +import eu.dnetlib.doiboost.orcidnodoi.util.Pair; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; +import java.util.stream.Collectors; + +import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*; + +public class OrcidWorkToOAF { + + static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class); + + public static final String ORCID = "ORCID"; + public final static String orcidPREFIX = "orcid_______"; + public static final String OPENAIRE_PREFIX = "openaire____"; + public static final String SEPARATOR = "::"; + + private static Map> datasources = new HashMap>() { + + { + put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); + + } + }; + + // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname + private static Map> externalIds = new HashMap>() { + + { + put("ark".toLowerCase(), new Pair<>("ark", "ark")); + put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); + put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); + put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); + put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); + put("urn".toLowerCase(), new Pair<>("urn", "urn")); + } + }; + + static Map> typologiesMapping; + + static { + try { + final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json")); + typologiesMapping = new Gson().fromJson(tt, Map.class); + } catch (final Exception e) { + logger.error("loading typologies", e); + } + } + + public static final String PID_TYPES = "dnet:pid_types"; + + public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) { + + if (!isValid(rootElement/*, context*/)) { return null; } + + Publication publication = new Publication(); + + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(false); + dataInfo.setInferred(false); + dataInfo.setTrust("0.9"); + dataInfo.setProvenanceaction( + mapQualifier( + "sysimport:actionset:orcidworks-no-doi", + "sysimport:actionset:orcidworks-no-doi", + "dnet:provenanceActions", + "dnet:provenanceActions")); + publication.setDataInfo(dataInfo); + + publication.setLastupdatetimestamp(new Date().getTime()); + + publication.setDateofcollection("2019-10-22"); + publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); + + // Adding external ids + externalIds.keySet().stream() + .forEach(jsonExtId -> { + final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); + final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); + final String extId = getStringValue(rootElement, jsonExtId); + if (StringUtils.isNotBlank(extId)) { + publication.getExternalReference().add( + convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types")); + } + }); + + // Adding source +// final String source = getStringValue(rootElement, "source"); +// if (StringUtils.isNotBlank(source)) { +// metadata.addSource(StringField.newBuilder().setValue(source).build()); +// } + + // Adding titles + final List titles = createRepeatedField(rootElement, "titles"); + if (titles==null || titles.isEmpty()) { +// context.incrementCounter("filtered", "title_not_found", 1); + return null; + } + Qualifier q = mapQualifier("main title","main title","dnet:dataCite_title","dnet:dataCite_title"); + publication.setTitle( + titles + .stream() + .map(t -> { + return mapStructuredProperty(t, q, null); + }) + .collect(Collectors.toList())); + // Adding identifier + final String id = getStringValue(rootElement, "id"); + String sourceId = null; + if (id != null) { + publication.setOriginalId(Arrays.asList(id)); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase())); + } else { + String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(",")); + sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase())); + } + publication.setId(sourceId); + + // Adding relevant date + settingRelevantDate(rootElement, publication, "publication_date", "issued", true); + + // Adding collectedfrom + publication.setCollectedfrom(Arrays.asList(createCollectedFrom())); + + // Adding type + final String type = getStringValue(rootElement, "type"); + String cobjValue = ""; + if (StringUtils.isNotBlank(type)) { + publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource")); + + final String typeValue = typologiesMapping.get(type).get("value"); + cobjValue = typologiesMapping.get(type).get("cobj"); + final Instance instance = new Instance(); + + // Adding hostedby + instance.setHostedby(createHostedBy()); + + // Adding url + final List urls = createRepeatedField(rootElement, "urls"); + if (urls!=null && !urls.isEmpty()) { + instance.setUrl(urls); + } + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + instance.setDateofacceptance(mapStringField(pubDate, null)); + } + + instance.setCollectedfrom(createCollectedFrom()); + + // Adding accessright + instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes")); + + // Adding type + instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource")); + + publication.setInstance(Arrays.asList(instance)); + } else { +// context.incrementCounter("filtered", "type_not_found", 1); + return null; + } + + // Adding authors + final List authors = createAuthors(rootElement); + if (authors != null && authors.size() > 0) { + publication.setAuthor(authors); + } else { +// context.incrementCounter("filtered", "author_not_found", 1); + return null; + } + String classValue = getDefaultResulttype(cobjValue); + publication.setResulttype(mapQualifier(classValue, classValue,"dnet:result_typologies", "dnet:result_typologies")); + return publication; + } + + public static List createAuthors(final JsonObject root) { + + final String authorsJSONFieldName = "authors"; + + if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { + + final List authors = new ArrayList<>(); + final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); + int firstCounter = 0; + int defaultCounter = 0; + int rank = 1; + int currentRank = 0; + + for (final JsonElement item : jsonAuthors) { + final JsonObject jsonAuthor = item.getAsJsonObject(); + final Author author = new Author(); + if (item.isJsonObject()) { + final String surname = getStringValue(jsonAuthor, "surname"); + final String name = getStringValue(jsonAuthor, "name"); + final String oid = getStringValue(jsonAuthor, "oid"); + final String seq = getStringValue(jsonAuthor, "seq"); + if (StringUtils.isNotBlank(seq)) { + if (seq.equals("first")) { + firstCounter += 1; + rank = firstCounter; + + } else if (seq.equals("additional")) { + rank = currentRank + 1; + } else { + defaultCounter += 1; + rank = defaultCounter; + } + } + + if (StringUtils.isNotBlank(oid)) { + author.setPid(Arrays.asList(mapAuthorId(oid))); + author.setFullname(name + " " + surname); + if (StringUtils.isNotBlank(name)) { + author.setName(name); + } + if (StringUtils.isNotBlank(surname)) { + author.setSurname(surname); + } + } else { + String fullname = ""; + if (StringUtils.isNotBlank(name)) { + fullname = name; + } else { + if (StringUtils.isNotBlank(surname)) { + fullname = surname; + } + } + PacePerson p = new PacePerson(fullname, false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + author.setFullname(p.getNormalisedFullname()); + } + else { + author.setFullname(fullname); + } + } + } + author.setRank(rank); + authors.add(author); + currentRank = rank; + } + return authors; + + } + return null; + } + + private static List createRepeatedField(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { return null; } + if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } + if (rootElement.get(fieldName).isJsonArray()) { + if (!isValidJsonArray(rootElement, fieldName)) { return null; } + return getArrayValues(rootElement, fieldName); + } else { + String field = getStringValue(rootElement, fieldName); + return Arrays.asList(cleanField(field)); + } + } + + private static String cleanField(String value) { + if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') { + value = value.substring(1, value.length() - 1); + } + return value; + } + + private static void settingRelevantDate(final JsonObject rootElement, + final Publication publication, + final String jsonKey, + final String dictionaryKey, + final boolean addToDateOfAcceptance) { + + final String pubDate = getPublicationDate(rootElement, "publication_date"); + if (StringUtils.isNotBlank(pubDate)) { + if (addToDateOfAcceptance) { + publication.setDateofacceptance(mapStringField(pubDate, null)); + } + Qualifier q = mapQualifier(dictionaryKey,dictionaryKey,"dnet:dataCite_date","dnet:dataCite_date"); + publication.setRelevantdate( + Arrays.asList(pubDate) + .stream() + .map(r -> { + return mapStructuredProperty(r, q, null); + }) + .collect(Collectors.toList())); + } + } + + private static String getPublicationDate(final JsonObject rootElement, + final String jsonKey) { + + final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); + if (pubDateJson == null) { return null; } + final String year = getStringValue(pubDateJson, "year"); + final String month = getStringValue(pubDateJson, "month"); + final String day = getStringValue(pubDateJson, "day"); + + if (StringUtils.isBlank(year)) { return null; } + String pubDate = "".concat(year); + if (StringUtils.isNotBlank(month)) { + pubDate = pubDate.concat("-" + month); + if (StringUtils.isNotBlank(day)) { + pubDate = pubDate.concat("-" + day); + } else { + pubDate += "-01"; + } + } else { + pubDate += "-01-01"; + } + if (isValidDate(pubDate)) { return pubDate; } + return null; + } + + protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) { + + final String type = getStringValue(rootElement, "type"); + if (!typologiesMapping.containsKey(type)) { +// context.incrementCounter("filtered", "unknowntype_" + type, 1); + return false; + } + + if (!isValidJsonArray(rootElement, "titles")) { +// context.incrementCounter("filtered", "invalid_title", 1); + return false; + } + return true; + } + + private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { + if (!rootElement.has(fieldName)) { return false; } + final JsonElement jsonElement = rootElement.get(fieldName); + if (jsonElement.isJsonNull()) { return false; } + if (jsonElement.isJsonArray()) { + final JsonArray jsonArray = jsonElement.getAsJsonArray(); + if (jsonArray.isJsonNull()) { return false; } + if (jsonArray.get(0).isJsonNull()) { return false; } + } + return true; + } + + private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(classId); + qualifier.setClassname(className); + qualifier.setSchemeid(schemeId); + qualifier.setSchemename(schemeName); + return qualifier; + } + + private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) { + ExternalReference ex = new ExternalReference(); + ex.setRefidentifier(extId); + ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName )); + return ex; + } + + private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) { + if (value == null | StringUtils.isBlank(value)) { + return null; + } + + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(value); + structuredProperty.setQualifier(qualifier); + structuredProperty.setDataInfo(dataInfo); + return structuredProperty; + } + + private static Field mapStringField(String value, DataInfo dataInfo) { + if (value == null || StringUtils.isBlank(value)) { + return null; + } + + final Field stringField = new Field<>(); + stringField.setValue(value); + stringField.setDataInfo(dataInfo); + return stringField; + } + + private static KeyValue createCollectedFrom() { + KeyValue cf = new KeyValue(); + cf.setValue(ORCID); + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); + return cf; + } + + private static KeyValue createHostedBy() { + KeyValue hb = new KeyValue(); + hb.setValue("Unknown Repository"); + hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c"); + return hb; + } + + private static StructuredProperty mapAuthorId(String orcidId) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(orcidId); + final Qualifier q = new Qualifier(); + q.setClassid("ORCID"); + q.setClassname("ORCID"); + sp.setQualifier(q); + return sp; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java deleted file mode 100644 index 01b172359..000000000 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java +++ /dev/null @@ -1,427 +0,0 @@ - -package eu.dnetlib.doiboost.orcidnodoi.proto; - -public class ProtoWriter { - -} -// -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue; -//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate; -// -//import java.io.IOException; -//import java.io.InputStream; -//import java.util.ArrayList; -//import java.util.HashMap; -//import java.util.List; -//import java.util.Map; -// -//import org.apache.commons.io.IOUtils; -//import org.apache.commons.lang3.StringUtils; -// -//import com.google.gson.Gson; -//import com.google.gson.JsonArray; -//import com.google.gson.JsonElement; -//import com.google.gson.JsonObject; -//import com.googlecode.protobuf.format.JsonFormat; -// -//import eu.dnetlib.actionmanager.actions.ActionFactory; -//import eu.dnetlib.actionmanager.actions.AtomicAction; -//import eu.dnetlib.actionmanager.common.Agent; -//import eu.dnetlib.data.mapreduce.hbase.Reporter; -//import eu.dnetlib.data.mapreduce.util.StreamUtils; -//import eu.dnetlib.data.proto.FieldTypeProtos; -//import eu.dnetlib.data.proto.FieldTypeProtos.Author; -//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo; -//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; -//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; -//import eu.dnetlib.data.proto.FieldTypeProtos.StringField; -//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; -//import eu.dnetlib.data.proto.KindProtos; -//import eu.dnetlib.data.proto.OafProtos; -//import eu.dnetlib.data.proto.ResultProtos; -//import eu.dnetlib.data.proto.TypeProtos; -//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions; -//import eu.dnetlib.miscutils.collections.Pair; -//import eu.dnetlib.miscutils.datetime.DateUtils; -//import eu.dnetlib.pace.model.Person; -// -//public class ProtoWriter { -// -// public static final String ORCID = "ORCID"; -// public final static String orcidPREFIX = "orcid_______"; -// public static final String OPENAIRE_PREFIX = "openaire____"; -// public static final String SEPARATOR = "::"; -// -// private static Map> datasources = new HashMap>() { -// -// { -// put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); -// -// } -// }; -// -// // json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname -// private static Map> externalIds = new HashMap>() { -// -// { -// put("ark".toLowerCase(), new Pair<>("ark", "ark")); -// put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); -// put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); -// put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); -// put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); -// put("urn".toLowerCase(), new Pair<>("urn", "urn")); -// } -// }; -// -// static Map> typologiesMapping; -// -// static { -// try { -// final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json"); -// final String tt = IOUtils.toString(is); -// typologiesMapping = new Gson().fromJson(tt, Map.class); -// } catch (final IOException e) { -// e.printStackTrace(); -// } -// } -// -// public static final String PID_TYPES = "dnet:pid_types"; -// -// public static List generatePublicationActionsFromDump(final JsonObject rootElement, -// final ActionFactory factory, -// final String setName, -// final Agent agent, -// final Reporter context) { -// -// if (!isValid(rootElement, context)) { return null; } -// -// // Create OAF proto -// -// final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder(); -// -// oaf.setDataInfo( -// DataInfo.newBuilder() -// .setDeletedbyinference(false) -// .setInferred(false) -// .setTrust("0.9") -// .setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions")) -// .build()); -// -// // Adding kind -// oaf.setKind(KindProtos.Kind.entity); -// -// oaf.setLastupdatetimestamp(DateUtils.now()); -// -// // creating result proto -// final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result); -// -// entity.setDateofcollection("2018-10-22"); -// entity.setDateoftransformation(DateUtils.now_ISO8601()); -// -// // Adding external ids -// StreamUtils.toStream(externalIds.keySet().iterator()) -// .forEach(jsonExtId -> { -// final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); -// final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); -// final String extId = getStringValue(rootElement, jsonExtId); -// if (StringUtils.isNotBlank(extId)) { -// entity.addPid(StructuredProperty.newBuilder() -// .setValue(extId) -// .setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types") -// .setSchemename("dnet:pid_types").build()) -// .build()); -// } -// }); -// -// // Create result field -// final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder(); -// -// // Create metadata proto -// final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); -// -// // Adding source -// final String source = getStringValue(rootElement, "source"); -// if (StringUtils.isNotBlank(source)) { -// metadata.addSource(StringField.newBuilder().setValue(source).build()); -// } -// -// // Adding title -// final String title = createRepeatedField(rootElement, "titles"); -// if (StringUtils.isBlank(title)) { -// context.incrementCounter("filtered", "title_not_found", 1); -// return null; -// } -// metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder() -// .setValue(title) -// .setQualifier(getQualifier("main title", "dnet:dataCite_title")) -// .build()); -// -// // Adding identifier -// final String id = getStringValue(rootElement, "id"); -// String sourceId = null; -// if (id != null) { -// entity.addOriginalId(id); -// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id)); -// } else { -// sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title)); -// } -// entity.setId(sourceId); -// -// // Adding relevant date -// settingRelevantDate(rootElement, metadata, "publication_date", "issued", true); -// -// // Adding collectedfrom -// final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() -// .setValue(ORCID) -// .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a") -// .build(); -// entity.addCollectedfrom(collectedFrom); -// -// // Adding type -// final String type = getStringValue(rootElement, "type"); -// String cobjValue = ""; -// if (StringUtils.isNotBlank(type)) { -// -// metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid(type) -// .setClassname(type) -// .setSchemeid("dnet:dataCite_resource") -// .setSchemename("dnet:dataCite_resource") -// .build()); -// -// final String typeValue = typologiesMapping.get(type).get("value"); -// cobjValue = typologiesMapping.get(type).get("cobj"); -// final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder(); -// -// // Adding hostedby -// instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder() -// .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c") -// .setValue("Unknown Repository") -// .build()); -// -// // Adding url -// final String url = createRepeatedField(rootElement, "urls"); -// if (StringUtils.isNotBlank(url)) { -// instance.addUrl(url); -// } -// -// final String pubDate = getPublicationDate(rootElement, "publication_date"); -// if (StringUtils.isNotBlank(pubDate)) { -// instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); -// } -// -// instance.setCollectedfrom(collectedFrom); -// -// // Adding accessright -// instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid("UNKNOWN") -// .setClassname("UNKNOWN") -// .setSchemeid("dnet:access_modes") -// .setSchemename("dnet:access_modes") -// .build()); -// -// // Adding type -// instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() -// .setClassid(cobjValue) -// .setClassname(typeValue) -// .setSchemeid("dnet:publication_resource") -// .setSchemename("dnet:publication_resource") -// .build()); -// -// result.addInstance(instance); -// } else { -// context.incrementCounter("filtered", "type_not_found", 1); -// return null; -// } -// -// // Adding authors -// final List authors = createAuthors(rootElement); -// if (authors != null && authors.size() > 0) { -// metadata.addAllAuthor(authors); -// } else { -// context.incrementCounter("filtered", "author_not_found", 1); -// return null; -// } -// -// metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies")); -// result.setMetadata(metadata.build()); -// entity.setResult(result.build()); -// oaf.setEntity(entity.build()); -// -// final List actionList = new ArrayList<>(); -// -// actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray())); -// -//// System.out.println(JsonFormat.printToString(oaf.build())); -// return actionList; -// -// } -// -// public static List createAuthors(final JsonObject root) { -// -// final String authorsJSONFieldName = "authors"; -// -// if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) { -// -// final List authors = new ArrayList<>(); -// final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName); -// int firstCounter = 0; -// int defaultCounter = 0; -// int rank = 1; -// int currentRank = 0; -// -// for (final JsonElement item : jsonAuthors) { -// final JsonObject author = item.getAsJsonObject(); -// final Author.Builder result = Author.newBuilder(); -// if (item.isJsonObject()) { -// final String surname = getStringValue(author, "surname"); -// final String name = getStringValue(author, "name"); -// final String oid = getStringValue(author, "oid"); -// final String seq = getStringValue(author, "seq"); -// if (StringUtils.isNotBlank(seq)) { -// if (seq.equals("first")) { -// firstCounter += 1; -// rank = firstCounter; -// -// } else if (seq.equals("additional")) { -// rank = currentRank + 1; -// } else { -// defaultCounter += 1; -// rank = defaultCounter; -// } -// } -// -// if (StringUtils.isNotBlank(oid)) { -// result.addPid(KeyValue.newBuilder() -// .setValue(oid) -// .setKey("ORCID") -// .build()); -// result.setFullname(name + " " + surname); -// if (StringUtils.isNotBlank(name)) { -// result.setName(name); -// } -// if (StringUtils.isNotBlank(surname)) { -// result.setSurname(surname); -// } -// } else { -// String fullname = ""; -// if (StringUtils.isNotBlank(name)) { -// fullname = name; -// } else { -// if (StringUtils.isNotBlank(surname)) { -// fullname = surname; -// } -// } -// Person p = new Person(fullname, false); -// if (p.isAccurate()) { -// result.setName(p.getNormalisedFirstName()); -// result.setSurname(p.getNormalisedSurname()); -// result.setFullname(p.getNormalisedFullname()); -// } -// else { -// result.setFullname(fullname); -// } -// } -// } -// result.setRank(rank); -// authors.add(result.build()); -// currentRank = rank; -// } -// return authors; -// -// } -// return null; -// } -// -// private static String createRepeatedField(final JsonObject rootElement, final String fieldName) { -// String field = ""; -// if (!rootElement.has(fieldName)) { return null; } -// if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; } -// if (rootElement.get(fieldName).isJsonArray()) { -// if (!isValidJsonArray(rootElement, fieldName)) { return null; } -// final StringBuilder ttl = new StringBuilder(); -// getArrayValues(rootElement, fieldName).forEach(ttl::append); -// field = ttl.toString(); -// } else { -// field = getStringValue(rootElement, fieldName); -// } -// -// if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') { -// field = field.substring(1, field.length() - 1); -// } -// return field; -// } -// -// private static void settingRelevantDate(final JsonObject rootElement, -// final ResultProtos.Result.Metadata.Builder metadata, -// final String jsonKey, -// final String dictionaryKey, -// final boolean addToDateOfAcceptance) { -// -// final String pubDate = getPublicationDate(rootElement, "publication_date"); -// if (StringUtils.isNotBlank(pubDate)) { -// if (addToDateOfAcceptance) { -// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build()); -// } -// metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder() -// .setValue(pubDate) -// .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date")) -// .build()); -// } -// } -// -// private static String getPublicationDate(final JsonObject rootElement, -// final String jsonKey) { -// -// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); -// if (pubDateJson == null) { return null; } -// final String year = getStringValue(pubDateJson, "year"); -// final String month = getStringValue(pubDateJson, "month"); -// final String day = getStringValue(pubDateJson, "day"); -// -// if (StringUtils.isBlank(year)) { return null; } -// String pubDate = "".concat(year); -// if (StringUtils.isNotBlank(month)) { -// pubDate = pubDate.concat("-" + month); -// if (StringUtils.isNotBlank(day)) { -// pubDate = pubDate.concat("-" + day); -// } else { -// pubDate += "-01"; -// } -// } else { -// pubDate += "-01-01"; -// } -// if (isValidDate(pubDate)) { return pubDate; } -// return null; -// } -// -// protected static boolean isValid(final JsonObject rootElement, final Reporter context) { -// -// final String type = getStringValue(rootElement, "type"); -// if (!typologiesMapping.containsKey(type)) { -// context.incrementCounter("filtered", "unknowntype_" + type, 1); -// return false; -// } -// -// if (!isValidJsonArray(rootElement, "titles")) { -// context.incrementCounter("filtered", "invalid_title", 1); -// return false; -// } -// return true; -// } -// -// private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) { -// if (!rootElement.has(fieldName)) { return false; } -// final JsonElement jsonElement = rootElement.get(fieldName); -// if (jsonElement.isJsonNull()) { return false; } -// if (jsonElement.isJsonArray()) { -// final JsonArray jsonArray = jsonElement.getAsJsonArray(); -// if (jsonArray.isJsonNull()) { return false; } -// if (jsonArray.get(0).isJsonNull()) { return false; } -// } -// return true; -// } -//} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java new file mode 100644 index 000000000..c460f6299 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/DumpToActionsUtility.java @@ -0,0 +1,107 @@ +package eu.dnetlib.doiboost.orcidnodoi.util; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import org.apache.commons.lang3.StringUtils; + +import java.text.SimpleDateFormat; +import java.util.*; + +public class DumpToActionsUtility { + + private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); + + public static String getStringValue(final JsonObject root, final String key) { + if (root.has(key) && !root.get(key).isJsonNull()) + return root.get(key).getAsString(); + return null; + } + + public static List getArrayValues(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + + + asJsonArray.forEach(it -> { + if (StringUtils.isNotBlank(it.getAsString())) { + result.add(it.getAsString()); + } + }); + return result; + } + return new ArrayList<>(); + } + public static List getArrayObjects(final JsonObject root, final String key) { + if (root.has(key) && root.get(key).isJsonArray()) { + final JsonArray asJsonArray = root.get(key).getAsJsonArray(); + final List result = new ArrayList<>(); + asJsonArray.forEach(it -> { + if (it.getAsJsonObject() != null) { + result.add(it.getAsJsonObject()); + } + }); + return result; + } + return new ArrayList<>(); + } + + public static boolean isValidDate(final String date) { + return date.matches("\\d{4}-\\d{2}-\\d{2}"); + } + + public static String now_ISO8601() { // NOPMD + String result; + synchronized (ISO8601FORMAT) { + result = ISO8601FORMAT.format(new Date()); + } + //convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00 + //- note the added colon for the Timezone + return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2); + } + + public static String getDefaultResulttype(final String cobjcategory) { + switch (cobjcategory) { + case "0029": + return "software"; + case "0021": + case "0024": + case "0025": + case "0030": + return "dataset"; + case "0000": + case "0010": + case "0018": + case "0020": + case "0022": + case "0023": + case "0026": + case "0027": + case "0028": + case "0037": + return "other"; + case "0001": + case "0002": + case "0004": + case "0005": + case "0006": + case "0007": + case "0008": + case "0009": + case "0011": + case "0012": + case "0013": + case "0014": + case "0015": + case "0016": + case "0017": + case "0019": + case "0031": + case "0032": + return "publication"; + default: + return "publication"; + } + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java new file mode 100644 index 000000000..58c09af60 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/util/Pair.java @@ -0,0 +1,30 @@ +package eu.dnetlib.doiboost.orcidnodoi.util; + +public class Pair { + + private K k; + + private V v; + + public Pair(K k, V v) { + this.k = k; + this.v = v; + } + + public K getKey() { + return k; + } + + public V getValue() { + return v; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof Pair) { + Pair tmp = (Pair) obj; + return k.equals(tmp.getKey()) && v.equals(tmp.getValue()); + } else return false; + } + +} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json new file mode 100644 index 000000000..cb696f279 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json @@ -0,0 +1,41 @@ +{ + "reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "report": {"cobj":"0017", "value": "Report"}, + "dataset": {"cobj":"0021", "value": "Dataset"}, + "journal-article": {"cobj":"0001", "value": "Article"}, + "reference-book": {"cobj":"0002", "value": "Book"}, + "other": {"cobj":"0020", "value": "Other ORP type"}, + "proceedings-article": {"cobj":"0004", "value": "Conference object"}, + "standard": {"cobj":"0038", "value": "Other literature type"}, + "book-part": {"cobj":"0002", "value": "Book"}, + "monograph": {"cobj":"0002", "value": "Book"}, + "report-series": {"cobj":"0017", "value": "Report"}, + "book": {"cobj":"0002", "value": "Book"}, + "book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "peer-review": {"cobj":"0015", "value": "Review"}, + "book-section": {"cobj":"0013", "value": "Part of book or chapter of book"}, + "book-review": {"cobj":"0015", "value": "Review"}, + "conference-abstract": {"cobj":"0004", "value": "Conference object"}, + "conference-paper": {"cobj":"0004", "value": "Conference object"}, + "conference-poster": {"cobj":"0004", "value": "Conference object"}, + "data-set": {"cobj":"0021", "value": "Dataset"}, + "dictionary-entry": {"cobj":"0038", "value": "Other literature type"}, + "disclosure": {"cobj":"0038", "value": "Other literature type"}, + "dissertation": {"cobj":"0006", "value": "Doctoral thesis"}, + "edited-book": {"cobj":"0002", "value": "Book"}, + "encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"}, + "lecture-speech": {"cobj":"0010", "value": "Lecture"}, + "license": {"cobj":"0038", "value": "Other literature type"}, + "magazine-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"}, + "manual": {"cobj":"0038", "value": "Other literature type"}, + "newsletter-article": {"cobj":"0012", "value": "Newsletter"}, + "newspaper-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"}, + "patent": {"cobj":"0019", "value": "Patent"}, + "research-technique": {"cobj":"0020", "value": "Other ORP type"}, + "research-tool": {"cobj":"0020", "value": "Other ORP type"}, + "standards-and-policy": {"cobj":"0038", "value": "Other literature type"}, + "supervised-student-publication": {"cobj":"0001", "value": "Article"}, + "technical-standard": {"cobj":"0038", "value": "Other literature type"}, + "website": {"cobj":"0020", "value": "Other ORP type"}, + "working-paper": {"cobj":"0014", "value": "Research"} +} \ No newline at end of file