forked from D-Net/dnet-hadoop
converter from orcid work json to oaf
This commit is contained in:
parent b7b6be12a5
commit 5525f57ec8
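As a rough illustration of how the new converter could be driven (this sketch is not part of the commit: the sample record simply mirrors the field names the class reads — type, titles, id, authors, publication_date — and assumes the typologies.json mapping is on the classpath; the setName value and class name are illustrative):

    import com.google.gson.JsonObject;
    import com.google.gson.JsonParser;
    import eu.dnetlib.dhp.schema.oaf.Oaf;
    import eu.dnetlib.doiboost.orcidnodoi.oaf.OrcidWorkToOAF;

    public class ConvertOneWork {
        public static void main(String[] args) {
            // one ORCID work record without a DOI, in the shape the converter expects
            String json = "{"
                + "\"type\": \"journal-article\","
                + "\"titles\": [\"An example work title\"],"
                + "\"id\": \"work:123456\","
                + "\"authors\": [{\"name\": \"Jane\", \"surname\": \"Doe\", \"oid\": \"0000-0001-2345-6789\", \"seq\": \"first\"}],"
                + "\"publication_date\": {\"year\": \"2019\", \"month\": \"10\", \"day\": \"22\"}"
                + "}";
            JsonObject work = new JsonParser().parse(json).getAsJsonObject();
            // returns null when the record is filtered out (missing title, unknown type, no authors)
            Oaf result = OrcidWorkToOAF.generatePublicationActionsFromDump(work, "orcidworks-no-doi");
            System.out.println(result != null ? "converted to OAF Publication" : "record filtered out");
        }
    }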
@@ -0,0 +1,420 @@
package eu.dnetlib.doiboost.orcidnodoi.oaf;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks;
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.stream.Collectors;

import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;

public class OrcidWorkToOAF {

	static Logger logger = LoggerFactory.getLogger(OrcidWorkToOAF.class);

	public static final String ORCID = "ORCID";
	public static final String orcidPREFIX = "orcid_______";
	public static final String OPENAIRE_PREFIX = "openaire____";
	public static final String SEPARATOR = "::";

	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {

		{
			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
		}
	};

	// each JSON external-id type maps to a Pair of (oaf:pid/@classname, oaf:pid/@classid)
	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {

		{
			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
		}
	};

	static Map<String, Map<String, String>> typologiesMapping;

	static {
		try {
			final String tt = IOUtils.toString(OrcidWorkToOAF.class.getResourceAsStream(
				"/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
			typologiesMapping = new Gson().fromJson(tt, Map.class);
		} catch (final Exception e) {
			logger.error("loading typologies", e);
		}
	}

	public static final String PID_TYPES = "dnet:pid_types";

	public static Oaf generatePublicationActionsFromDump(final JsonObject rootElement, final String setName) {

		if (!isValid(rootElement/*, context*/)) { return null; }

		Publication publication = new Publication();

		final DataInfo dataInfo = new DataInfo();
		dataInfo.setDeletedbyinference(false);
		dataInfo.setInferred(false);
		dataInfo.setTrust("0.9");
		dataInfo.setProvenanceaction(
			mapQualifier(
				"sysimport:actionset:orcidworks-no-doi",
				"sysimport:actionset:orcidworks-no-doi",
				"dnet:provenanceActions",
				"dnet:provenanceActions"));
		publication.setDataInfo(dataInfo);

		publication.setLastupdatetimestamp(new Date().getTime());

		publication.setDateofcollection("2019-10-22");
		publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());

		// Adding external ids
		externalIds.keySet().stream()
			.forEach(jsonExtId -> {
				final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
				final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
				final String extId = getStringValue(rootElement, jsonExtId);
				if (StringUtils.isNotBlank(extId)) {
					publication.getExternalReference().add(
						convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
				}
			});

		// Adding source
		// final String source = getStringValue(rootElement, "source");
		// if (StringUtils.isNotBlank(source)) {
		//     metadata.addSource(StringField.newBuilder().setValue(source).build());
		// }

		// Adding titles
		final List<String> titles = createRepeatedField(rootElement, "titles");
		if (titles == null || titles.isEmpty()) {
			// context.incrementCounter("filtered", "title_not_found", 1);
			return null;
		}
		Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
		publication.setTitle(
			titles
				.stream()
				.map(t -> mapStructuredProperty(t, q, null))
				.collect(Collectors.toList()));

		// Adding identifier
		final String id = getStringValue(rootElement, "id");
		String sourceId = null;
		if (id != null) {
			publication.setOriginalId(Arrays.asList(id));
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
		} else {
			String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
		}
		publication.setId(sourceId);

		// Adding relevant date
		settingRelevantDate(rootElement, publication, "publication_date", "issued", true);

		// Adding collectedfrom
		publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));

		// Adding type
		final String type = getStringValue(rootElement, "type");
		String cobjValue = "";
		if (StringUtils.isNotBlank(type)) {
			publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));

			final String typeValue = typologiesMapping.get(type).get("value");
			cobjValue = typologiesMapping.get(type).get("cobj");
			final Instance instance = new Instance();

			// Adding hostedby
			instance.setHostedby(createHostedBy());

			// Adding url
			final List<String> urls = createRepeatedField(rootElement, "urls");
			if (urls != null && !urls.isEmpty()) {
				instance.setUrl(urls);
			}

			final String pubDate = getPublicationDate(rootElement, "publication_date");
			if (StringUtils.isNotBlank(pubDate)) {
				instance.setDateofacceptance(mapStringField(pubDate, null));
			}

			instance.setCollectedfrom(createCollectedFrom());

			// Adding accessright
			instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));

			// Adding type
			instance.setInstancetype(mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));

			publication.setInstance(Arrays.asList(instance));
		} else {
			// context.incrementCounter("filtered", "type_not_found", 1);
			return null;
		}

		// Adding authors
		final List<Author> authors = createAuthors(rootElement);
		if (authors != null && authors.size() > 0) {
			publication.setAuthor(authors);
		} else {
			// context.incrementCounter("filtered", "author_not_found", 1);
			return null;
		}
		String classValue = getDefaultResulttype(cobjValue);
		publication.setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies"));
		return publication;
	}

	public static List<Author> createAuthors(final JsonObject root) {

		final String authorsJSONFieldName = "authors";

		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {

			final List<Author> authors = new ArrayList<>();
			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
			int firstCounter = 0;
			int defaultCounter = 0;
			int rank = 1;
			int currentRank = 0;

			for (final JsonElement item : jsonAuthors) {
				final JsonObject jsonAuthor = item.getAsJsonObject();
				final Author author = new Author();
				if (item.isJsonObject()) {
					final String surname = getStringValue(jsonAuthor, "surname");
					final String name = getStringValue(jsonAuthor, "name");
					final String oid = getStringValue(jsonAuthor, "oid");
					final String seq = getStringValue(jsonAuthor, "seq");
					if (StringUtils.isNotBlank(seq)) {
						if (seq.equals("first")) {
							firstCounter += 1;
							rank = firstCounter;
						} else if (seq.equals("additional")) {
							rank = currentRank + 1;
						} else {
							defaultCounter += 1;
							rank = defaultCounter;
						}
					}

					if (StringUtils.isNotBlank(oid)) {
						author.setPid(Arrays.asList(mapAuthorId(oid)));
						author.setFullname(name + " " + surname);
						if (StringUtils.isNotBlank(name)) {
							author.setName(name);
						}
						if (StringUtils.isNotBlank(surname)) {
							author.setSurname(surname);
						}
					} else {
						String fullname = "";
						if (StringUtils.isNotBlank(name)) {
							fullname = name;
						} else {
							if (StringUtils.isNotBlank(surname)) {
								fullname = surname;
							}
						}
						PacePerson p = new PacePerson(fullname, false);
						if (p.isAccurate()) {
							author.setName(p.getNormalisedFirstName());
							author.setSurname(p.getNormalisedSurname());
							author.setFullname(p.getNormalisedFullname());
						} else {
							author.setFullname(fullname);
						}
					}
				}
				author.setRank(rank);
				authors.add(author);
				currentRank = rank;
			}
			return authors;
		}
		return null;
	}

	private static List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
		if (!rootElement.has(fieldName)) { return null; }
		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
		if (rootElement.get(fieldName).isJsonArray()) {
			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
			return getArrayValues(rootElement, fieldName);
		} else {
			String field = getStringValue(rootElement, fieldName);
			return Arrays.asList(cleanField(field));
		}
	}

	private static String cleanField(String value) {
		if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
			value = value.substring(1, value.length() - 1);
		}
		return value;
	}

	private static void settingRelevantDate(final JsonObject rootElement,
											final Publication publication,
											final String jsonKey,
											final String dictionaryKey,
											final boolean addToDateOfAcceptance) {

		final String pubDate = getPublicationDate(rootElement, "publication_date");
		if (StringUtils.isNotBlank(pubDate)) {
			if (addToDateOfAcceptance) {
				publication.setDateofacceptance(mapStringField(pubDate, null));
			}
			Qualifier q = mapQualifier(dictionaryKey, dictionaryKey, "dnet:dataCite_date", "dnet:dataCite_date");
			publication.setRelevantdate(
				Arrays.asList(pubDate)
					.stream()
					.map(r -> mapStructuredProperty(r, q, null))
					.collect(Collectors.toList()));
		}
	}

	private static String getPublicationDate(final JsonObject rootElement,
											 final String jsonKey) {

		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
		if (pubDateJson == null) { return null; }
		final String year = getStringValue(pubDateJson, "year");
		final String month = getStringValue(pubDateJson, "month");
		final String day = getStringValue(pubDateJson, "day");

		if (StringUtils.isBlank(year)) { return null; }
		String pubDate = "".concat(year);
		if (StringUtils.isNotBlank(month)) {
			pubDate = pubDate.concat("-" + month);
			if (StringUtils.isNotBlank(day)) {
				pubDate = pubDate.concat("-" + day);
			} else {
				pubDate += "-01";
			}
		} else {
			pubDate += "-01-01";
		}
		if (isValidDate(pubDate)) { return pubDate; }
		return null;
	}

	protected static boolean isValid(final JsonObject rootElement/*, final Reporter context*/) {

		final String type = getStringValue(rootElement, "type");
		if (!typologiesMapping.containsKey(type)) {
			// context.incrementCounter("filtered", "unknowntype_" + type, 1);
			return false;
		}

		if (!isValidJsonArray(rootElement, "titles")) {
			// context.incrementCounter("filtered", "invalid_title", 1);
			return false;
		}
		return true;
	}

	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
		if (!rootElement.has(fieldName)) { return false; }
		final JsonElement jsonElement = rootElement.get(fieldName);
		if (jsonElement.isJsonNull()) { return false; }
		if (jsonElement.isJsonArray()) {
			final JsonArray jsonArray = jsonElement.getAsJsonArray();
			if (jsonArray.isJsonNull()) { return false; }
			if (jsonArray.get(0).isJsonNull()) { return false; }
		}
		return true;
	}

	private static Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
		final Qualifier qualifier = new Qualifier();
		qualifier.setClassid(classId);
		qualifier.setClassname(className);
		qualifier.setSchemeid(schemeId);
		qualifier.setSchemename(schemeName);
		return qualifier;
	}

	private static ExternalReference convertExtRef(String extId, String classId, String className, String schemeId, String schemeName) {
		ExternalReference ex = new ExternalReference();
		ex.setRefidentifier(extId);
		ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName));
		return ex;
	}

	private static StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
		if (value == null || StringUtils.isBlank(value)) {
			return null;
		}

		final StructuredProperty structuredProperty = new StructuredProperty();
		structuredProperty.setValue(value);
		structuredProperty.setQualifier(qualifier);
		structuredProperty.setDataInfo(dataInfo);
		return structuredProperty;
	}

	private static Field<String> mapStringField(String value, DataInfo dataInfo) {
		if (value == null || StringUtils.isBlank(value)) {
			return null;
		}

		final Field<String> stringField = new Field<>();
		stringField.setValue(value);
		stringField.setDataInfo(dataInfo);
		return stringField;
	}

	private static KeyValue createCollectedFrom() {
		KeyValue cf = new KeyValue();
		cf.setValue(ORCID);
		cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
		return cf;
	}

	private static KeyValue createHostedBy() {
		KeyValue hb = new KeyValue();
		hb.setValue("Unknown Repository");
		hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
		return hb;
	}

	private static StructuredProperty mapAuthorId(String orcidId) {
		final StructuredProperty sp = new StructuredProperty();
		sp.setValue(orcidId);
		final Qualifier q = new Qualifier();
		q.setClassid("ORCID");
		q.setClassname("ORCID");
		sp.setQualifier(q);
		return sp;
	}
}
@@ -1,427 +0,0 @@
package eu.dnetlib.doiboost.orcidnodoi.proto;

public class ProtoWriter {

}
//
//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
//
//import java.io.IOException;
//import java.io.InputStream;
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import org.apache.commons.io.IOUtils;
//import org.apache.commons.lang3.StringUtils;
//
//import com.google.gson.Gson;
//import com.google.gson.JsonArray;
//import com.google.gson.JsonElement;
//import com.google.gson.JsonObject;
//import com.googlecode.protobuf.format.JsonFormat;
//
//import eu.dnetlib.actionmanager.actions.ActionFactory;
//import eu.dnetlib.actionmanager.actions.AtomicAction;
//import eu.dnetlib.actionmanager.common.Agent;
//import eu.dnetlib.data.mapreduce.hbase.Reporter;
//import eu.dnetlib.data.mapreduce.util.StreamUtils;
//import eu.dnetlib.data.proto.FieldTypeProtos;
//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
//import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
//import eu.dnetlib.data.proto.KindProtos;
//import eu.dnetlib.data.proto.OafProtos;
//import eu.dnetlib.data.proto.ResultProtos;
//import eu.dnetlib.data.proto.TypeProtos;
//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
//import eu.dnetlib.miscutils.collections.Pair;
//import eu.dnetlib.miscutils.datetime.DateUtils;
//import eu.dnetlib.pace.model.Person;
//
//public class ProtoWriter {
//
//	public static final String ORCID = "ORCID";
//	public final static String orcidPREFIX = "orcid_______";
//	public static final String OPENAIRE_PREFIX = "openaire____";
//	public static final String SEPARATOR = "::";
//
//	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
//
//		{
//			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
//		}
//	};
//
//	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
//	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
//
//		{
//			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
//			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
//			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
//			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
//			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
//			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
//		}
//	};
//
//	static Map<String, Map<String, String>> typologiesMapping;
//
//	static {
//		try {
//			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
//			final String tt = IOUtils.toString(is);
//			typologiesMapping = new Gson().fromJson(tt, Map.class);
//		} catch (final IOException e) {
//			e.printStackTrace();
//		}
//	}
//
//	public static final String PID_TYPES = "dnet:pid_types";
//
//	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
//			final ActionFactory factory,
//			final String setName,
//			final Agent agent,
//			final Reporter context) {
//
//		if (!isValid(rootElement, context)) { return null; }
//
//		// Create OAF proto
//
//		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
//
//		oaf.setDataInfo(
//			DataInfo.newBuilder()
//				.setDeletedbyinference(false)
//				.setInferred(false)
//				.setTrust("0.9")
//				.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
//				.build());
//
//		// Adding kind
//		oaf.setKind(KindProtos.Kind.entity);
//
//		oaf.setLastupdatetimestamp(DateUtils.now());
//
//		// creating result proto
//		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
//
//		entity.setDateofcollection("2018-10-22");
//		entity.setDateoftransformation(DateUtils.now_ISO8601());
//
//		// Adding external ids
//		StreamUtils.toStream(externalIds.keySet().iterator())
//			.forEach(jsonExtId -> {
//				final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
//				final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
//				final String extId = getStringValue(rootElement, jsonExtId);
//				if (StringUtils.isNotBlank(extId)) {
//					entity.addPid(StructuredProperty.newBuilder()
//						.setValue(extId)
//						.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
//							.setSchemename("dnet:pid_types").build())
//						.build());
//				}
//			});
//
//		// Create result field
//		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
//
//		// Create metadata proto
//		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
//
//		// Adding source
//		final String source = getStringValue(rootElement, "source");
//		if (StringUtils.isNotBlank(source)) {
//			metadata.addSource(StringField.newBuilder().setValue(source).build());
//		}
//
//		// Adding title
//		final String title = createRepeatedField(rootElement, "titles");
//		if (StringUtils.isBlank(title)) {
//			context.incrementCounter("filtered", "title_not_found", 1);
//			return null;
//		}
//		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
//			.setValue(title)
//			.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
//			.build());
//
//		// Adding identifier
//		final String id = getStringValue(rootElement, "id");
//		String sourceId = null;
//		if (id != null) {
//			entity.addOriginalId(id);
//			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
//		} else {
//			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
//		}
//		entity.setId(sourceId);
//
//		// Adding relevant date
//		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
//
//		// Adding collectedfrom
//		final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
//			.setValue(ORCID)
//			.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
//			.build();
//		entity.addCollectedfrom(collectedFrom);
//
//		// Adding type
//		final String type = getStringValue(rootElement, "type");
//		String cobjValue = "";
//		if (StringUtils.isNotBlank(type)) {
//
//			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
//				.setClassid(type)
//				.setClassname(type)
//				.setSchemeid("dnet:dataCite_resource")
//				.setSchemename("dnet:dataCite_resource")
//				.build());
//
//			final String typeValue = typologiesMapping.get(type).get("value");
//			cobjValue = typologiesMapping.get(type).get("cobj");
//			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
//
//			// Adding hostedby
//			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
//				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
//				.setValue("Unknown Repository")
//				.build());
//
//			// Adding url
//			final String url = createRepeatedField(rootElement, "urls");
//			if (StringUtils.isNotBlank(url)) {
//				instance.addUrl(url);
//			}
//
//			final String pubDate = getPublicationDate(rootElement, "publication_date");
//			if (StringUtils.isNotBlank(pubDate)) {
//				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
//			}
//
//			instance.setCollectedfrom(collectedFrom);
//
//			// Adding accessright
//			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
//				.setClassid("UNKNOWN")
//				.setClassname("UNKNOWN")
//				.setSchemeid("dnet:access_modes")
//				.setSchemename("dnet:access_modes")
//				.build());
//
//			// Adding type
//			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
//				.setClassid(cobjValue)
//				.setClassname(typeValue)
//				.setSchemeid("dnet:publication_resource")
//				.setSchemename("dnet:publication_resource")
//				.build());
//
//			result.addInstance(instance);
//		} else {
//			context.incrementCounter("filtered", "type_not_found", 1);
//			return null;
//		}
//
//		// Adding authors
//		final List<Author> authors = createAuthors(rootElement);
//		if (authors != null && authors.size() > 0) {
//			metadata.addAllAuthor(authors);
//		} else {
//			context.incrementCounter("filtered", "author_not_found", 1);
//			return null;
//		}
//
//		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
//		result.setMetadata(metadata.build());
//		entity.setResult(result.build());
//		oaf.setEntity(entity.build());
//
//		final List<AtomicAction> actionList = new ArrayList<>();
//
//		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
//
////		System.out.println(JsonFormat.printToString(oaf.build()));
//		return actionList;
//
//	}
//
//	public static List<Author> createAuthors(final JsonObject root) {
//
//		final String authorsJSONFieldName = "authors";
//
//		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
//
//			final List<Author> authors = new ArrayList<>();
//			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
//			int firstCounter = 0;
//			int defaultCounter = 0;
//			int rank = 1;
//			int currentRank = 0;
//
//			for (final JsonElement item : jsonAuthors) {
//				final JsonObject author = item.getAsJsonObject();
//				final Author.Builder result = Author.newBuilder();
//				if (item.isJsonObject()) {
//					final String surname = getStringValue(author, "surname");
//					final String name = getStringValue(author, "name");
//					final String oid = getStringValue(author, "oid");
//					final String seq = getStringValue(author, "seq");
//					if (StringUtils.isNotBlank(seq)) {
//						if (seq.equals("first")) {
//							firstCounter += 1;
//							rank = firstCounter;
//
//						} else if (seq.equals("additional")) {
//							rank = currentRank + 1;
//						} else {
//							defaultCounter += 1;
//							rank = defaultCounter;
//						}
//					}
//
//					if (StringUtils.isNotBlank(oid)) {
//						result.addPid(KeyValue.newBuilder()
//							.setValue(oid)
//							.setKey("ORCID")
//							.build());
//						result.setFullname(name + " " + surname);
//						if (StringUtils.isNotBlank(name)) {
//							result.setName(name);
//						}
//						if (StringUtils.isNotBlank(surname)) {
//							result.setSurname(surname);
//						}
//					} else {
//						String fullname = "";
//						if (StringUtils.isNotBlank(name)) {
//							fullname = name;
//						} else {
//							if (StringUtils.isNotBlank(surname)) {
//								fullname = surname;
//							}
//						}
//						Person p = new Person(fullname, false);
//						if (p.isAccurate()) {
//							result.setName(p.getNormalisedFirstName());
//							result.setSurname(p.getNormalisedSurname());
//							result.setFullname(p.getNormalisedFullname());
//						}
//						else {
//							result.setFullname(fullname);
//						}
//					}
//				}
//				result.setRank(rank);
//				authors.add(result.build());
//				currentRank = rank;
//			}
//			return authors;
//
//		}
//		return null;
//	}
//
//	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
//		String field = "";
//		if (!rootElement.has(fieldName)) { return null; }
//		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
//		if (rootElement.get(fieldName).isJsonArray()) {
//			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
//			final StringBuilder ttl = new StringBuilder();
//			getArrayValues(rootElement, fieldName).forEach(ttl::append);
//			field = ttl.toString();
//		} else {
//			field = getStringValue(rootElement, fieldName);
//		}
//
//		if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
//			field = field.substring(1, field.length() - 1);
//		}
//		return field;
//	}
//
//	private static void settingRelevantDate(final JsonObject rootElement,
//			final ResultProtos.Result.Metadata.Builder metadata,
//			final String jsonKey,
//			final String dictionaryKey,
//			final boolean addToDateOfAcceptance) {
//
//		final String pubDate = getPublicationDate(rootElement, "publication_date");
//		if (StringUtils.isNotBlank(pubDate)) {
//			if (addToDateOfAcceptance) {
//				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
//			}
//			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
//				.setValue(pubDate)
//				.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
//				.build());
//		}
//	}
//
//	private static String getPublicationDate(final JsonObject rootElement,
//			final String jsonKey) {
//
//		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
//		if (pubDateJson == null) { return null; }
//		final String year = getStringValue(pubDateJson, "year");
//		final String month = getStringValue(pubDateJson, "month");
//		final String day = getStringValue(pubDateJson, "day");
//
//		if (StringUtils.isBlank(year)) { return null; }
//		String pubDate = "".concat(year);
//		if (StringUtils.isNotBlank(month)) {
//			pubDate = pubDate.concat("-" + month);
//			if (StringUtils.isNotBlank(day)) {
//				pubDate = pubDate.concat("-" + day);
//			} else {
//				pubDate += "-01";
//			}
//		} else {
//			pubDate += "-01-01";
//		}
//		if (isValidDate(pubDate)) { return pubDate; }
//		return null;
//	}
//
//	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
//
//		final String type = getStringValue(rootElement, "type");
//		if (!typologiesMapping.containsKey(type)) {
//			context.incrementCounter("filtered", "unknowntype_" + type, 1);
//			return false;
//		}
//
//		if (!isValidJsonArray(rootElement, "titles")) {
//			context.incrementCounter("filtered", "invalid_title", 1);
//			return false;
//		}
//		return true;
//	}
//
//	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
//		if (!rootElement.has(fieldName)) { return false; }
//		final JsonElement jsonElement = rootElement.get(fieldName);
//		if (jsonElement.isJsonNull()) { return false; }
//		if (jsonElement.isJsonArray()) {
//			final JsonArray jsonArray = jsonElement.getAsJsonArray();
//			if (jsonArray.isJsonNull()) { return false; }
//			if (jsonArray.get(0).isJsonNull()) { return false; }
//		}
//		return true;
//	}
//}
@@ -0,0 +1,107 @@
package eu.dnetlib.doiboost.orcidnodoi.util;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import org.apache.commons.lang3.StringUtils;

import java.text.SimpleDateFormat;
import java.util.*;

public class DumpToActionsUtility {

	private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);

	public static String getStringValue(final JsonObject root, final String key) {
		if (root.has(key) && !root.get(key).isJsonNull())
			return root.get(key).getAsString();
		return null;
	}

	public static List<String> getArrayValues(final JsonObject root, final String key) {
		if (root.has(key) && root.get(key).isJsonArray()) {
			final JsonArray asJsonArray = root.get(key).getAsJsonArray();
			final List<String> result = new ArrayList<>();

			asJsonArray.forEach(it -> {
				if (StringUtils.isNotBlank(it.getAsString())) {
					result.add(it.getAsString());
				}
			});
			return result;
		}
		return new ArrayList<>();
	}

	public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
		if (root.has(key) && root.get(key).isJsonArray()) {
			final JsonArray asJsonArray = root.get(key).getAsJsonArray();
			final List<JsonObject> result = new ArrayList<>();
			asJsonArray.forEach(it -> {
				if (it.getAsJsonObject() != null) {
					result.add(it.getAsJsonObject());
				}
			});
			return result;
		}
		return new ArrayList<>();
	}

	public static boolean isValidDate(final String date) {
		return date.matches("\\d{4}-\\d{2}-\\d{2}");
	}

	public static String now_ISO8601() { // NOPMD
		String result;
		synchronized (ISO8601FORMAT) {
			result = ISO8601FORMAT.format(new Date());
		}
		// convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
		// - note the added colon for the timezone
		return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
	}

	public static String getDefaultResulttype(final String cobjcategory) {
		switch (cobjcategory) {
			case "0029":
				return "software";
			case "0021":
			case "0024":
			case "0025":
			case "0030":
				return "dataset";
			case "0000":
			case "0010":
			case "0018":
			case "0020":
			case "0022":
			case "0023":
			case "0026":
			case "0027":
			case "0028":
			case "0037":
				return "other";
			case "0001":
			case "0002":
			case "0004":
			case "0005":
			case "0006":
			case "0007":
			case "0008":
			case "0009":
			case "0011":
			case "0012":
			case "0013":
			case "0014":
			case "0015":
			case "0016":
			case "0017":
			case "0019":
			case "0031":
			case "0032":
				return "publication";
			default:
				return "publication";
		}
	}

}
@@ -0,0 +1,30 @@
package eu.dnetlib.doiboost.orcidnodoi.util;

import java.util.Objects;

public class Pair<K, V> {

	private K k;

	private V v;

	public Pair(K k, V v) {
		this.k = k;
		this.v = v;
	}

	public K getKey() {
		return k;
	}

	public V getValue() {
		return v;
	}

	@Override
	public boolean equals(Object obj) {
		if (obj instanceof Pair<?, ?>) {
			Pair<?, ?> tmp = (Pair<?, ?>) obj;
			return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
		} else
			return false;
	}

	// equals is overridden, so hashCode must be kept consistent with it
	@Override
	public int hashCode() {
		return Objects.hash(k, v);
	}

}
@@ -0,0 +1,41 @@
{
	"reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"report": {"cobj":"0017", "value": "Report"},
	"dataset": {"cobj":"0021", "value": "Dataset"},
	"journal-article": {"cobj":"0001", "value": "Article"},
	"reference-book": {"cobj":"0002", "value": "Book"},
	"other": {"cobj":"0020", "value": "Other ORP type"},
	"proceedings-article": {"cobj":"0004", "value": "Conference object"},
	"standard": {"cobj":"0038", "value": "Other literature type"},
	"book-part": {"cobj":"0002", "value": "Book"},
	"monograph": {"cobj":"0002", "value": "Book"},
	"report-series": {"cobj":"0017", "value": "Report"},
	"book": {"cobj":"0002", "value": "Book"},
	"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"peer-review": {"cobj":"0015", "value": "Review"},
	"book-section": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"book-review": {"cobj":"0015", "value": "Review"},
	"conference-abstract": {"cobj":"0004", "value": "Conference object"},
	"conference-paper": {"cobj":"0004", "value": "Conference object"},
	"conference-poster": {"cobj":"0004", "value": "Conference object"},
	"data-set": {"cobj":"0021", "value": "Dataset"},
	"dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
	"disclosure": {"cobj":"0038", "value": "Other literature type"},
	"dissertation": {"cobj":"0006", "value": "Doctoral thesis"},
	"edited-book": {"cobj":"0002", "value": "Book"},
	"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
	"lecture-speech": {"cobj":"0010", "value": "Lecture"},
	"license": {"cobj":"0038", "value": "Other literature type"},
	"magazine-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
	"manual": {"cobj":"0038", "value": "Other literature type"},
	"newsletter-article": {"cobj":"0012", "value": "Newsletter"},
	"newspaper-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
	"patent": {"cobj":"0019", "value": "Patent"},
	"research-technique": {"cobj":"0020", "value": "Other ORP type"},
	"research-tool": {"cobj":"0020", "value": "Other ORP type"},
	"standards-and-policy": {"cobj":"0038", "value": "Other literature type"},
	"supervised-student-publication": {"cobj":"0001", "value": "Article"},
	"technical-standard": {"cobj":"0038", "value": "Other literature type"},
	"website": {"cobj":"0020", "value": "Other ORP type"},
	"working-paper": {"cobj":"0014", "value": "Research"}
}