diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
index 80ccd71a18..02d2b267b8 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class ActivitiesDecompressor {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index 603bfedf66..29d72ed0b5 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -19,9 +19,9 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class SummariesDecompressor {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
index 13a3cee8fa..bfd6f7447d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@@ -3,6 +3,7 @@ package eu.dnetlib.doiboost.orcid.json;
 
 import com.google.gson.Gson;
 import com.google.gson.JsonObject;
+
 import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
 
 public class JsonHelper {
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
index 7eb6faf545..506641b813 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -1,10 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import eu.dnetlib.doiboost.orcid.json.JsonHelper;
-import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.hadoop.conf.Configuration;
@@ -17,11 +19,10 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URI;
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
 
 public class ActivitiesDumpReader {
 
@@ -82,7 +83,8 @@ public class ActivitiesDumpReader {
 				while ((line = br.readLine()) != null) {
 					buffer.append(line);
 				}
-				WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
+				WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
+					.VTDParseWorkData(buffer.toString().getBytes());
 				if (workDataNoDoi != null) {
 					if (workDataNoDoi.getErrorCode() != null) {
 						errorFromOrcidFound += 1;
@@ -94,9 +96,11 @@ public class ActivitiesDumpReader {
 							+ entry.getName());
 						continue;
 					}
-					boolean isDoiFound = workDataNoDoi.getExtIds().stream()
-						.filter(e -> e.getType()!=null)
-						.anyMatch(e -> e.getType().equals("doi"));
+					boolean isDoiFound = workDataNoDoi
+						.getExtIds()
+						.stream()
+						.filter(e -> e.getType() != null)
+						.anyMatch(e -> e.getType().equals("doi"));
 					if (!isDoiFound) {
 						String jsonData = JsonHelper.createOidWork(workDataNoDoi);
 						Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
index b82f4bc4ca..bbaa5acca3 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -1,15 +1,16 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.OrcidDSManager;
+import java.io.IOException;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.mortbay.log.Log;
 
-import java.io.IOException;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.OrcidDSManager;
 
 public class GenOrcidAuthorWork extends OrcidDSManager {
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
index 6bb31bcf60..9d9c5bc4a4 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -1,13 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi;
 
-import com.google.gson.Gson;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonParser;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
@@ -19,14 +18,17 @@ import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
 import scala.Tuple2;
 
-import java.io.IOException;
-import java.util.Objects;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
 public class SparkGenEnrichedOrcidWorks {
 
	public static void main(String[] args) throws IOException, Exception {
@@ -67,27 +69,28 @@ public class SparkGenEnrichedOrcidWorks {
						Encoders.bean(AuthorData.class));
 
				JavaPairRDD<Text, Text> activitiesRDD = sc
-					.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
+					.sequenceFile(workingPath + outputWorksPath + "works_X.seq", Text.class, Text.class);
				Dataset<WorkDataNoDoi> activitiesDataset = spark
					.createDataset(
						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
						Encoders.bean(WorkDataNoDoi.class));
 
				activitiesDataset
-					.joinWith(
-						summariesDataset,
-						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
-					.map(
-						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
-							WorkDataNoDoi w = value._1;
-							AuthorData a = value._2;
-							AuthorMatcher.match(a, w.getContributors());
-							return new Tuple2<>(a.getOid(), w);
-						},
-						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
-					.filter(Objects::nonNull)
-					.toJavaRDD()
-					.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
+					.joinWith(
+						summariesDataset,
+						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
+					.map(
+						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+							WorkDataNoDoi w = value._1;
+							AuthorData a = value._2;
+							AuthorMatcher.match(a, w.getContributors());
+							return new Tuple2<>(a.getOid(), w);
+						},
+						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+					.filter(Objects::nonNull)
+					.toJavaRDD()
+					.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+				;
			});
	}
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
new file mode 100644
index 0000000000..01b172359e
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/proto/ProtoWriter.java
@@ -0,0 +1,427 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.proto;
+
+public class ProtoWriter {
+
+}
+//
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
+//import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
+//
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//import org.apache.commons.io.IOUtils;
+//import org.apache.commons.lang3.StringUtils;
+//
+//import com.google.gson.Gson;
+//import com.google.gson.JsonArray;
+//import com.google.gson.JsonElement;
+//import com.google.gson.JsonObject;
+//import com.googlecode.protobuf.format.JsonFormat;
+//
+//import eu.dnetlib.actionmanager.actions.ActionFactory;
+//import eu.dnetlib.actionmanager.actions.AtomicAction;
+//import eu.dnetlib.actionmanager.common.Agent;
+//import eu.dnetlib.data.mapreduce.hbase.Reporter;
+//import eu.dnetlib.data.mapreduce.util.StreamUtils;
+//import eu.dnetlib.data.proto.FieldTypeProtos;
+//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
+//import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
+//import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
+//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+//import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
+//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
+//import eu.dnetlib.data.proto.KindProtos;
+//import eu.dnetlib.data.proto.OafProtos;
+//import eu.dnetlib.data.proto.ResultProtos;
+//import eu.dnetlib.data.proto.TypeProtos;
+//import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
+//import eu.dnetlib.miscutils.collections.Pair;
+//import eu.dnetlib.miscutils.datetime.DateUtils;
+//import eu.dnetlib.pace.model.Person;
+//
+//public class ProtoWriter {
+//
+//	public static final String ORCID = "ORCID";
+//	public final static String orcidPREFIX = "orcid_______";
+//	public static final String OPENAIRE_PREFIX = "openaire____";
+//	public static final String SEPARATOR = "::";
+//
+//	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
+//
+//		{
+//			put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+//
+//		}
+//	};
+//
+//	// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
+//	private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
+//
+//		{
+//			put("ark".toLowerCase(), new Pair<>("ark", "ark"));
+//			put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
+//			put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
+//			put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
+//			put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
+//			put("urn".toLowerCase(), new Pair<>("urn", "urn"));
+//		}
+//	};
+//
+//	static Map<String, Map<String, String>> typologiesMapping;
+//
+//	static {
+//		try {
+//			final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
+//			final String tt = IOUtils.toString(is);
+//			typologiesMapping = new Gson().fromJson(tt, Map.class);
+//		} catch (final IOException e) {
+//			e.printStackTrace();
+//		}
+//	}
+//
+//	public static final String PID_TYPES = "dnet:pid_types";
+//
+//	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
+//		final ActionFactory factory,
+//		final String setName,
+//		final Agent agent,
+//		final Reporter context) {
+//
+//		if (!isValid(rootElement, context)) { return null; }
+//
+//		// Create OAF proto
+//
+//		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
+//
+//		oaf.setDataInfo(
+//			DataInfo.newBuilder()
+//				.setDeletedbyinference(false)
+//				.setInferred(false)
+//				.setTrust("0.9")
+//				.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
+//				.build());
+//
+//		// Adding kind
+//		oaf.setKind(KindProtos.Kind.entity);
+//
+//		oaf.setLastupdatetimestamp(DateUtils.now());
+//
+//		// creating result proto
+//		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
+//		entity.setDateofcollection("2018-10-22");
+//		entity.setDateoftransformation(DateUtils.now_ISO8601());
+//
+//		// Adding external ids
+//		StreamUtils.toStream(externalIds.keySet().iterator())
+//			.forEach(jsonExtId -> {
+//				final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
+//				final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
+//				final String extId = getStringValue(rootElement, jsonExtId);
+//				if (StringUtils.isNotBlank(extId)) {
+//					entity.addPid(StructuredProperty.newBuilder()
+//						.setValue(extId)
+//						.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
+//							.setSchemename("dnet:pid_types").build())
+//						.build());
+//				}
+//			});
+//
+//		// Create result field
+//		final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
+//
+//		// Create metadata proto
+//		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
+//
+//		// Adding source
+//		final String source = getStringValue(rootElement, "source");
+//		if (StringUtils.isNotBlank(source)) {
+//			metadata.addSource(StringField.newBuilder().setValue(source).build());
+//		}
+//
+//		// Adding title
+//		final String title = createRepeatedField(rootElement, "titles");
+//		if (StringUtils.isBlank(title)) {
+//			context.incrementCounter("filtered", "title_not_found", 1);
+//			return null;
+//		}
+//		metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
+//			.setValue(title)
+//			.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
+//			.build());
+//
+//		// Adding identifier
+//		final String id = getStringValue(rootElement, "id");
+//		String sourceId = null;
+//		if (id != null) {
+//			entity.addOriginalId(id);
+//			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
+//		} else {
+//			sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
+//		}
+//		entity.setId(sourceId);
+//
+//		// Adding relevant date
+//		settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
+//
+//		// Adding collectedfrom
+//		final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
+//			.setValue(ORCID)
+//			.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
+//			.build();
+//		entity.addCollectedfrom(collectedFrom);
+//
+//		// Adding type
+//		final String type = getStringValue(rootElement, "type");
+//		String cobjValue = "";
+//		if (StringUtils.isNotBlank(type)) {
+//
+//			metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
+//				.setClassid(type)
+//				.setClassname(type)
+//				.setSchemeid("dnet:dataCite_resource")
+//				.setSchemename("dnet:dataCite_resource")
+//				.build());
+//
+//			final String typeValue = typologiesMapping.get(type).get("value");
+//			cobjValue = typologiesMapping.get(type).get("cobj");
+//			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
+//
+//			// Adding hostedby
+//			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
+//				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
+//				.setValue("Unknown Repository")
+//				.build());
+//
+//			// Adding url
+//			final String url = createRepeatedField(rootElement, "urls");
+//			if (StringUtils.isNotBlank(url)) {
+//				instance.addUrl(url);
+//			}
+//
+//			final String pubDate = getPublicationDate(rootElement, "publication_date");
+//			if (StringUtils.isNotBlank(pubDate)) {
+//				instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
+//			}
+//
+//			instance.setCollectedfrom(collectedFrom);
+//
+//			// Adding accessright
+//			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
+//				.setClassid("UNKNOWN")
+//				.setClassname("UNKNOWN")
+//				.setSchemeid("dnet:access_modes")
+//				.setSchemename("dnet:access_modes")
+//				.build());
+//
+//			// Adding type
+//			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
+//				.setClassid(cobjValue)
+//				.setClassname(typeValue)
+//				.setSchemeid("dnet:publication_resource")
+//				.setSchemename("dnet:publication_resource")
+//				.build());
+//
+//			result.addInstance(instance);
+//		} else {
+//			context.incrementCounter("filtered", "type_not_found", 1);
+//			return null;
+//		}
+//
+//		// Adding authors
+//		final List<Author> authors = createAuthors(rootElement);
+//		if (authors != null && authors.size() > 0) {
+//			metadata.addAllAuthor(authors);
+//		} else {
+//			context.incrementCounter("filtered", "author_not_found", 1);
+//			return null;
+//		}
+//
+//		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
+//		result.setMetadata(metadata.build());
+//		entity.setResult(result.build());
+//		oaf.setEntity(entity.build());
+//
+//		final List<AtomicAction> actionList = new ArrayList<>();
+//
+//		actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
+//
+////		System.out.println(JsonFormat.printToString(oaf.build()));
+//		return actionList;
+//
+//	}
+//
+//	public static List<Author> createAuthors(final JsonObject root) {
+//
+//		final String authorsJSONFieldName = "authors";
+//
+//		if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
+//
+//			final List<Author> authors = new ArrayList<>();
+//			final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
+//			int firstCounter = 0;
+//			int defaultCounter = 0;
+//			int rank = 1;
+//			int currentRank = 0;
+//
+//			for (final JsonElement item : jsonAuthors) {
+//				final JsonObject author = item.getAsJsonObject();
+//				final Author.Builder result = Author.newBuilder();
+//				if (item.isJsonObject()) {
+//					final String surname = getStringValue(author, "surname");
+//					final String name = getStringValue(author, "name");
+//					final String oid = getStringValue(author, "oid");
+//					final String seq = getStringValue(author, "seq");
+//					if (StringUtils.isNotBlank(seq)) {
+//						if (seq.equals("first")) {
+//							firstCounter += 1;
+//							rank = firstCounter;
+//
+//						} else if (seq.equals("additional")) {
+//							rank = currentRank + 1;
+//						} else {
+//							defaultCounter += 1;
+//							rank = defaultCounter;
+//						}
+//					}
+//
+//					if (StringUtils.isNotBlank(oid)) {
+//						result.addPid(KeyValue.newBuilder()
+//							.setValue(oid)
+//							.setKey("ORCID")
+//							.build());
+//						result.setFullname(name + " " + surname);
+//						if (StringUtils.isNotBlank(name)) {
+//							result.setName(name);
+//						}
+//						if (StringUtils.isNotBlank(surname)) {
+//							result.setSurname(surname);
+//						}
+//					} else {
+//						String fullname = "";
+//						if (StringUtils.isNotBlank(name)) {
+//							fullname = name;
+//						} else {
+//							if (StringUtils.isNotBlank(surname)) {
+//								fullname = surname;
+//							}
+//						}
+//						Person p = new Person(fullname, false);
+//						if (p.isAccurate()) {
+//							result.setName(p.getNormalisedFirstName());
+//							result.setSurname(p.getNormalisedSurname());
+//							result.setFullname(p.getNormalisedFullname());
+//						}
+//						else {
+//							result.setFullname(fullname);
+//						}
+//					}
+//				}
+//				result.setRank(rank);
+//				authors.add(result.build());
+//				currentRank = rank;
+//			}
+//			return authors;
+//
+//		}
+//		return null;
+//	}
+//
+//	private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
+//		String field = "";
+//		if (!rootElement.has(fieldName)) { return null; }
+//		if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
+//		if (rootElement.get(fieldName).isJsonArray()) {
+//			if (!isValidJsonArray(rootElement, fieldName)) { return null; }
+//			final StringBuilder ttl = new StringBuilder();
+//			getArrayValues(rootElement, fieldName).forEach(ttl::append);
+//			field = ttl.toString();
+//		} else {
+//			field = getStringValue(rootElement, fieldName);
+//		}
+//
+//		if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
+//			field = field.substring(1, field.length() - 1);
+//		}
+//		return field;
+//	}
+//
+//	private static void settingRelevantDate(final JsonObject rootElement,
+//		final ResultProtos.Result.Metadata.Builder metadata,
+//		final String jsonKey,
+//		final String dictionaryKey,
+//		final boolean addToDateOfAcceptance) {
+//
+//		final String pubDate = getPublicationDate(rootElement, "publication_date");
+//		if (StringUtils.isNotBlank(pubDate)) {
+//			if (addToDateOfAcceptance) {
+//				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
+//			}
+//			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
+//				.setValue(pubDate)
+//				.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
+//				.build());
+//		}
+//	}
+//
+//	private static String getPublicationDate(final JsonObject rootElement,
+//		final String jsonKey) {
+//
+//		final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
+//		if (pubDateJson == null) { return null; }
+//		final String year = getStringValue(pubDateJson, "year");
+//		final String month = getStringValue(pubDateJson, "month");
+//		final String day = getStringValue(pubDateJson, "day");
+//
+//		if (StringUtils.isBlank(year)) { return null; }
+//		String pubDate = "".concat(year);
+//		if (StringUtils.isNotBlank(month)) {
+//			pubDate = pubDate.concat("-" + month);
+//			if (StringUtils.isNotBlank(day)) {
+//				pubDate = pubDate.concat("-" + day);
+//			} else {
+//				pubDate += "-01";
+//			}
+//		} else {
+//			pubDate += "-01-01";
+//		}
+//		if (isValidDate(pubDate)) { return pubDate; }
+//		return null;
+//	}
+//
+//	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
+//
+//		final String type = getStringValue(rootElement, "type");
+//		if (!typologiesMapping.containsKey(type)) {
+//			context.incrementCounter("filtered", "unknowntype_" + type, 1);
+//			return false;
+//		}
+//
+//		if (!isValidJsonArray(rootElement, "titles")) {
+//			context.incrementCounter("filtered", "invalid_title", 1);
+//			return false;
+//		}
+//		return true;
+//	}
+//
+//	private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
+//		if (!rootElement.has(fieldName)) { return false; }
+//		final JsonElement jsonElement = rootElement.get(fieldName);
+//		if (jsonElement.isJsonNull()) { return false; }
+//		if (jsonElement.isJsonArray()) {
+//			final JsonArray jsonArray = jsonElement.getAsJsonArray();
+//			if (jsonArray.isJsonNull()) { return false; }
+//			if (jsonArray.get(0).isJsonNull()) { return false; }
+//		}
+//		return true;
+//	}
+//}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
index 2486bdb241..33fbdf8756 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -509,7 +509,7 @@
             <mode>cluster</mode>
             <name>Gen_Enriched_Orcid_Works</name>
             <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
-            <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
+            <jar>dhp-doiboost-1.2.2-SNAPSHOT.jar</jar>
             <spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
             </spark-opts>
             <arg>-w</arg><arg>${workingPath}/