[DOIBoost author merger] merge with beta and added new test to verify the match for 0000-0002-4333-2748

commit 2aaa63dfa2
Author: Miriam Baglioni, 2023-05-23 15:44:03 +02:00
681 changed files with 38544 additions and 21694 deletions

2
.gitignore vendored

@ -25,4 +25,4 @@ spark-warehouse
/**/job-override.properties
/**/*.log
/**/.factorypath
/**/.scalafmt.conf

21
.scalafmt.conf Normal file

@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

dhp-build/dhp-build-assembly-resources/pom.xml

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

dhp-build/dhp-build-properties-maven-plugin/pom.xml

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

dhp-build/dhp-code-style/pom.xml

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<packaging>jar</packaging>
@ -47,12 +47,16 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>


@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

dhp-build/pom.xml

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

dhp-common/pom.xml

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Constants.java

@ -10,6 +10,12 @@ public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static final String ROR_NS_PREFIX = "ror_________";
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
public static final String ROR_DATASOURCE_NAME = "Research Organization Registry (ROR)";
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {

MDStoreInfo.java

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.common;
/**
* This utility represents the Metadata Store information
* needed during the migration from MongoDB to HDFS.
*/
public class MDStoreInfo {
private String mdstore;
private String currentId;
private Long latestTimestamp;
/**
* Instantiates a new Md store info.
*/
public MDStoreInfo() {
}
/**
* Instantiates a new Md store info.
*
* @param mdstore the mdstore
* @param currentId the current id
* @param latestTimestamp the latest timestamp
*/
public MDStoreInfo(String mdstore, String currentId, Long latestTimestamp) {
this.mdstore = mdstore;
this.currentId = currentId;
this.latestTimestamp = latestTimestamp;
}
/**
* Gets mdstore.
*
* @return the mdstore
*/
public String getMdstore() {
return mdstore;
}
/**
* Sets mdstore.
*
* @param mdstore the mdstore
* @return the mdstore
*/
public MDStoreInfo setMdstore(String mdstore) {
this.mdstore = mdstore;
return this;
}
/**
* Gets current id.
*
* @return the current id
*/
public String getCurrentId() {
return currentId;
}
/**
* Sets current id.
*
* @param currentId the current id
* @return the current id
*/
public MDStoreInfo setCurrentId(String currentId) {
this.currentId = currentId;
return this;
}
/**
* Gets latest timestamp.
*
* @return the latest timestamp
*/
public Long getLatestTimestamp() {
return latestTimestamp;
}
/**
* Sets latest timestamp.
*
* @param latestTimestamp the latest timestamp
* @return the latest timestamp
*/
public MDStoreInfo setLatestTimestamp(Long latestTimestamp) {
this.latestTimestamp = latestTimestamp;
return this;
}
@Override
public String toString() {
return "MDStoreInfo{" +
"mdstore='" + mdstore + '\'' +
", currentId='" + currentId + '\'' +
", latestTimestamp=" + latestTimestamp +
'}';
}
}
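
A minimal usage sketch of the fluent setters above; the identifiers and the timestamp are invented for illustration:

    MDStoreInfo info = new MDStoreInfo()
        .setMdstore("md-1234")                // hypothetical mdstore id
        .setCurrentId("md-1234::v2")          // hypothetical current collection id
        .setLatestTimestamp(1684849443000L);  // hypothetical epoch millis
    System.out.println(info);
    // MDStoreInfo{mdstore='md-1234', currentId='md-1234::v2', latestTimestamp=1684849443000}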

MakeTarArchive.java

@ -5,13 +5,71 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTarArchive implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTarArchive.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dirIterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dirIterator.hasNext()) {
LocatedFileStatus fileStatus = dirIterator.next();
Path p = fileStatus.getPath();
String pathString = p.toString();
String entity = pathString.substring(pathString.lastIndexOf("/") + 1);
MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit);
}
}
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
if (fileSystem.exists(hdfsWritePath)) {
@ -21,7 +79,7 @@ public class MakeTarArchive implements Serializable {
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
@ -37,7 +95,7 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true);
while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, iterator, ar, 0);
writeCurrentFile(fileSystem, dirName, iterator, ar, 0);
}
}
@ -59,32 +117,30 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext();
while (next) {
TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar");
try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) {
long current_size = 0;
while (next && current_size < bytesPerSplit) {
current_size = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, current_size);
next = fileStatusListIterator.hasNext();
long currentSize = 0;
while (next && currentSize < bytesPerSplit) {
currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize);
next = fileStatusListIterator.hasNext();
}
partNum += 1;
}
partNum += 1;
ar.close();
}
}
}
private static long writeCurrentFile(FileSystem fileSystem, String dir_name,
private static long writeCurrentFile(FileSystem fileSystem, String dirName,
RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long current_size) throws IOException {
TarArchiveOutputStream ar, long currentSize) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
String pString = p.toString();
if (!pString.endsWith("_SUCCESS")) {
String name = pString.substring(pString.lastIndexOf("/") + 1);
if (name.startsWith("part-") && name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
@ -92,9 +148,9 @@ public class MakeTarArchive implements Serializable {
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();
currentSize += fileStatus.getLen();
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
@ -110,7 +166,7 @@ public class MakeTarArchive implements Serializable {
ar.closeArchiveEntry();
}
return current_size;
return currentSize;
}
}
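
A hedged sketch of invoking the archiver directly, mirroring what main() does after argument parsing; the namenode URL and the paths are assumptions:

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // assumed namenode
    FileSystem fileSystem = FileSystem.get(conf);
    // each directory under the input path becomes <outputPath>/<entity>_<n>.tar,
    // split so that no single tar holds more than ~10 GB of entries
    MakeTarArchive.makeTArArchive(fileSystem, "/user/dnet/graph", "/user/dnet/graph_tar", 10);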

MdstoreClient.java

@ -1,12 +1,12 @@
package eu.dnetlib.dhp.common;
import static com.mongodb.client.model.Sorts.descending;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
@ -38,6 +38,26 @@ public class MdstoreClient implements Closeable {
this.db = getDb(client, dbName);
}
private Long parseTimestamp(Document f) {
if (f == null || !f.containsKey("timestamp"))
return null;
Object ts = f.get("timestamp");
return Long.parseLong(ts.toString());
}
public Long getLatestTimestamp(final String collectionId) {
MongoCollection<Document> collection = db.getCollection(collectionId);
FindIterable<Document> result = collection.find().sort(descending("timestamp")).limit(1);
if (result == null) {
return null;
}
Document f = result.first();
return parseTimestamp(f);
}
public MongoCollection<Document> mdStore(final String mdId) {
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
@ -54,6 +74,16 @@ public class MdstoreClient implements Closeable {
return getColl(db, currentId, true);
}
public List<MDStoreInfo> mdStoreWithTimestamp(final String mdFormat, final String mdLayout,
final String mdInterpretation) {
Map<String, String> res = validCollections(mdFormat, mdLayout, mdInterpretation);
return res
.entrySet()
.stream()
.map(e -> new MDStoreInfo(e.getKey(), e.getValue(), getLatestTimestamp(e.getValue())))
.collect(Collectors.toList());
}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
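
An illustrative call of the new mdStoreWithTimestamp helper, assuming the (baseUrl, dbName) constructor; the Mongo URL, database name and the format/layout/interpretation triple are placeholders:

    try (MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore")) {
        for (MDStoreInfo info : client.mdStoreWithTimestamp("ODF", "store", "cleaned")) {
            System.out.println(info); // mdstore id, current collection id, latest record timestamp
        }
    }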

PacePerson.java

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
@ -29,7 +29,19 @@ public class PacePerson {
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Capitalizes a string
@ -37,29 +49,20 @@ public class PacePerson {
* @param s the string to capitalize
* @return the input string with the first letter of each word capitalized
*/
public static final String capitalize(final String s) {
public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
/**
* Adds a dot to a string with length equals to 1
*/
public static final String dotAbbreviations(final String s) {
public static String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/**
* The constructor of the class. It fills the fields of the class based on the input fullname.
*
@ -128,10 +131,6 @@ public class PacePerson {
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
@ -187,17 +186,36 @@ public class PacePerson {
}
public List<String> getCapitalFirstnames() {
return Lists
.newArrayList(
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
return Optional
.ofNullable(getNameWithAbbreviations())
.map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public boolean isAccurate() {
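
A short sketch of the accessors reworked above, assuming the two-argument constructor (fullname plus an aggressive-normalization flag); the name is invented:

    PacePerson p = new PacePerson("baglioni, miriam", false); // assumed constructor signature
    System.out.println(p.getNameWithAbbreviations()); // e.g. [miriam]
    System.out.println(p.getCapitalFirstnames());     // e.g. [Miriam]
    System.out.println(p.getCapitalSurname());        // e.g. [Baglioni]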

ReadDatasourceMasterDuplicateFromDB.java

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.common.action;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class ReadDatasourceMasterDuplicateFromDB {
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
+
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException {
int count = 0;
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
log.info("running query: {}", QUERY);
log.info("storing results in: {}", hdfsPath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
count++;
}
}
return count;
}
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try {
final MasterDuplicate md = new MasterDuplicate();
final String duplicateId = rs.getString("duplicateId");
final String masterId = rs.getString("masterId");
final String masterName = rs.getString("masterName");
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setMasterName(masterName);
return md;
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
private static void writeMap(final MasterDuplicate dm, final BufferedWriter writer) {
try {
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
writer.newLine();
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}
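
A hypothetical invocation; every connection value below is invented:

    int written = ReadDatasourceMasterDuplicateFromDB
        .execute(
            "jdbc:postgresql://postgres:5432/dnet", // assumed service DB URL
            "dnet", "secret",                       // assumed credentials
            "/user/dnet/masterduplicate",           // HDFS output path
            "hdfs://namenode:8020");                // assumed namenode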

MasterDuplicate.java

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.common.action.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/07/22
*/
public class MasterDuplicate implements Serializable {
private String duplicateId;
private String masterId;
private String masterName;
public String getDuplicateId() {
return duplicateId;
}
public void setDuplicateId(String duplicateId) {
this.duplicateId = duplicateId;
}
public String getMasterId() {
return masterId;
}
public void setMasterId(String masterId) {
this.masterId = masterId;
}
public String getMasterName() {
return masterName;
}
public void setMasterName(String masterName) {
this.masterName = masterName;
}
}

ZenodoAPIClient.java

@ -3,10 +3,13 @@ package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
@ -60,33 +63,31 @@ public class ZenodoAPIClient implements Serializable {
*/
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
json = response.body().string();
ZenodoModel newSubmission = new Gson().fromJson(json, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return response.code();
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
@ -94,28 +95,48 @@ public class ZenodoAPIClient implements Serializable {
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
*/
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
public int uploadIS(InputStream is, String file_name) throws IOException {
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
@ -127,26 +148,34 @@ public class ZenodoAPIClient implements Serializable {
*/
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
// successful only for 200 OK or 201 Created
return HttpURLConnection.HTTP_OK == responseCode
|| HttpURLConnection.HTTP_CREATED == responseCode;
}
/**
@ -155,6 +184,7 @@ public class ZenodoAPIClient implements Serializable {
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
@ -191,31 +221,37 @@ public class ZenodoAPIClient implements Serializable {
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id);
setDepositionId(concept_rec_id, 1);
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
@ -233,29 +269,38 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
String json = "{}";
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
@ -263,55 +308,57 @@ public class ZenodoAPIClient implements Serializable {
return;
}
}
throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions");
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
private String getPrevDepositions(String page) throws IOException {
Request request = new Request.Builder()
.url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
try (Response response = httpClient.newCall(request).execute()) {
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
String body = getBody(conn);
return response.body().string();
int responseCode = conn.getResponseCode();
}
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
private String getBucket(String inputUrl) throws IOException {
Request request = new Request.Builder()
.url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
URL url = new URL(inputUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
try (Response response = httpClient.newCall(request).execute()) {
String body = getBody(conn);
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
int responseCode = conn.getResponseCode();
// Get response body
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return zenodoModel.getLinks().getBucket();
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
}
return zenodoModel.getLinks().getBucket();
}
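
A sketch of the typical call sequence after this move from OkHttp to HttpURLConnection; the constructor signature, the token, the concept record id, the metadataJson String and the java.io imports are assumptions:

    ZenodoAPIClient client = new ZenodoAPIClient(urlString, accessToken); // assumed constructor
    client.newVersion("1234567"); // pages through prior depositions; throws MissingConceptDoiException if absent
    try (InputStream is = new FileInputStream("dump.tar")) {
        client.uploadIS(is, "dump.tar"); // streams the file to <bucket>/dump.tar via PUT
    }
    client.sendMretadata(metadataJson); // PUTs the JSON metadata onto the deposition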

DecompressTarGz.java

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.common.collection;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class DecompressTarGz {
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(inputFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
}
}
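
An assumed usage of doExtract; since outputPath is concatenated directly with each entry name, it should end with a '/'. Paths and namenode are placeholders:

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // assumed namenode
    FileSystem fs = FileSystem.get(conf);
    // each archive entry is re-compressed to <outputPath><entryName>.gz
    DecompressTarGz.doExtract(fs, "/user/dnet/extracted/", "/user/dnet/dump.tar.gz");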

Vocabulary.java

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
@ -66,21 +67,39 @@ public class Vocabulary implements Serializable {
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return getTermAsQualifier(termId, false);
}
public Qualifier getTermAsQualifier(final String termId, boolean strict) {
final VocabularyTerm term = getTerm(termId);
if (Objects.nonNull(term)) {
return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
} else if (Objects.isNull(term) && strict) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return getSynonymAsQualifier(syn, false);
}
public Qualifier getSynonymAsQualifier(final String syn, boolean strict) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.map(term -> getTermAsQualifier(term.getId(), strict))
.orElse(null);
}
public Qualifier lookup(String id) {
return lookup(id, false);
}
public Qualifier lookup(String id, boolean strict) {
return Optional
.ofNullable(getSynonymAsQualifier(id, strict))
.orElse(getTermAsQualifier(id, strict));
}
}
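
A minimal sketch of the strict/non-strict semantics introduced here, assuming a populated Vocabulary instance named accessModes:

    Qualifier q1 = accessModes.lookup("OPEN");              // term or synonym -> its qualifier
    Qualifier q2 = accessModes.lookup("not-a-term");        // non-strict -> echoes the input term back
    Qualifier q3 = accessModes.lookup("not-a-term", true);  // strict -> the vocabulary's UNKNOWN qualifier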

VocabularyGroup.java

@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);
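
An illustrative chain over the new Optional-returning find, assuming a loaded VocabularyGroup named vocabularies and the dnet:access_modes vocabulary id:

    vocabularies
        .find("dnet:access_modes") // case-insensitive lookup, empty Optional if absent
        .map(voc -> voc.lookup("OPEN"))
        .ifPresent(q -> System.out.println(q.getClassname()));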

DispatchEntitiesSparkJob.java

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -38,7 +38,7 @@ public class DispatchEntitiesSparkJob {
.requireNonNull(
DispatchEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
"/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

GroupEntitiesSparkJob.java

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
@ -53,7 +53,7 @@ public class GroupEntitiesSparkJob {
.toString(
GroupEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

GraphCleaningFunctions.java

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
@ -11,12 +13,15 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -35,6 +40,127 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (shouldCleanContext(res, verifyParam)) {
res
.setContext(
res
.getContext()
.stream()
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static boolean shouldCleanContext(Result res, String verifyParam) {
boolean titleMatch = res
.getTitle()
.stream()
.filter(
t -> t
.getQualifier()
.getClassid()
.equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
.anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
return titleMatch && Objects.nonNull(res.getContext());
}
public static <T extends Oaf> T cleanCountry(T value, String[] verifyParam, Set<String> hostedBy,
String collectedfrom, String country) {
if (ModelSupport.isSubClass(value, Result.class)) {
final Result res = (Result) value;
if (res.getInstance().stream().anyMatch(i -> hostedBy.contains(i.getHostedby().getKey())) ||
!res.getCollectedfrom().stream().anyMatch(cf -> cf.getValue().equals(collectedfrom))) {
return (T) res;
}
List<StructuredProperty> ids = getPidsAndAltIds(res).collect(Collectors.toList());
if (ids
.stream()
.anyMatch(
p -> p
.getQualifier()
.getClassid()
.equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) {
res
.setCountry(
res
.getCountry()
.stream()
.filter(
c -> toTakeCountry(c, country))
.collect(Collectors.toList()));
}
return (T) res;
} else {
return value;
}
}
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
final Stream<StructuredProperty> resultPids = Optional
.ofNullable(r.getPid())
.map(Collection::stream)
.orElse(Stream.empty());
final Stream<StructuredProperty> instancePids = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getPid())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
final Stream<StructuredProperty> instanceAltIds = Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.flatMap(
i -> Optional
.ofNullable(i.getAlternateIdentifier())
.map(Collection::stream)
.orElse(Stream.empty())))
.orElse(Stream.empty());
return Stream
.concat(
Stream.concat(resultPids, instancePids),
instanceAltIds);
}
private static boolean pidInParam(String value, String[] verifyParam) {
for (String s : verifyParam)
if (value.startsWith(s))
return true;
return false;
}
private static boolean toTakeCountry(Country c, String country) {
// if dataInfo or dataInfo.inferenceprovenance is missing, the country cannot have been
// inserted via propagation, so it is kept
if (!Optional.ofNullable(c.getDataInfo()).isPresent())
return true;
if (!Optional.ofNullable(c.getDataInfo().getInferenceprovenance()).isPresent())
return true;
return !(c
.getClassid()
.equalsIgnoreCase(country) &&
c.getDataInfo().getInferenceprovenance().equals("propagation"));
}
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
@ -101,7 +227,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.orElse(true))
.orElse(true))
.orElse(true))) {
return false;
return true;
}
if (value instanceof Datasource) {
@ -191,8 +317,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
List<Subject> subjects = Lists
.newArrayList(
r
.getSubject()
.stream()
@ -200,8 +326,26 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(s -> {
if ("dnet:result_subject".equals(s.getQualifier().getClassid())) {
s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
}
return s;
})
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
.collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
if (Objects.nonNull(r.getTitle())) {
r
@ -321,7 +465,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {
@ -382,14 +526,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
String pidProvenance = getProvenance(p.getDataInfo());
if (p
.getQualifier()
.getClassid()
@ -520,6 +657,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return s;
}
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
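
A hedged sketch of the two new cleaning entry points; pub, hostedByIds and the literal values are placeholders:

    // drop contexts whose id starts with "sobigdata" from results whose main title
    // starts with the verification string
    Publication cleanedCtx = GraphCleaningFunctions.cleanContext(pub, "sobigdata", "test");
    // drop a propagated country (e.g. NL) from results whose DOI starts with one of the
    // given prefixes, unless hosted by one of the listed datasources or not collected
    // from the given one
    Publication cleanedCountry = GraphCleaningFunctions
        .cleanCountry(pub, new String[] { "10.17632" }, hostedByIds, "Datacite", "NL");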

OafMapperUtils.java

@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.sql.Array;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
@ -12,6 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -47,6 +50,17 @@ public class OafMapperUtils {
}
public static Result mergeResults(Result left, Result right) {
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
return left;
}
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
return right;
}
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
@ -56,6 +70,18 @@ public class OafMapperUtils {
}
}
private static boolean isFromDelegatedAuthority(Result r) {
return Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
.map(i -> i.getCollectedfrom().getKey())
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
.orElse(false);
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
@ -95,6 +121,17 @@ public class OafMapperUtils {
.collect(Collectors.toList());
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
@ -105,7 +142,7 @@ public class OafMapperUtils {
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
@ -153,6 +190,17 @@ public class OafMapperUtils {
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -164,6 +212,20 @@ public class OafMapperUtils {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
@ -368,4 +430,88 @@ public class OafMapperUtils {
}
return null;
}
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
KeyValue kv = new KeyValue();
kv.setDataInfo(dataInfo);
kv.setKey(key);
kv.setValue(value);
return kv;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m;
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity,
final String validationDate) {
return getRelation(
source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(),
entity.getLastupdatetimestamp(), validationDate, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp) {
return getRelation(
source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setCollectedfrom(collectedfrom);
rel.setDataInfo(dataInfo);
rel.setLastupdatetimestamp(lastupdatetimestamp);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}
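
A hypothetical use of the new relation factory, deriving provenance from an existing entity and marking the link as validated; the ids are illustrative and the relation constants are assumed to come from ModelConstants:

    Relation rel = OafMapperUtils
        .getRelation(
            "50|doi_________::ab12...",  // illustrative source id
            "40|corda_______::cd34...",  // illustrative target id
            ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, ModelConstants.IS_PRODUCED_BY,
            entity,        // an OafEntity supplying collectedfrom, dataInfo and lastupdatetimestamp
            "2023-05-23"); // non-blank validation date -> rel.getValidated() == true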

SubjectProvenanceComparator.java

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}
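
A minimal sketch of how the cleaning code uses this comparator: among duplicate subjects, Collections.min keeps the one whose provenance ranks best (crosswalk before user before propagation before iis); the two Subject instances are assumed to exist:

    Subject best = Collections.min(
        Lists.newArrayList(subjectFromCrosswalk, subjectFromIis),
        new SubjectProvenanceComparator()); // -> subjectFromCrosswalk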

DHPUtils.java

@ -75,9 +75,14 @@ public class DHPUtils {
final HttpGet req = new HttpGet(url);
log.info("MDStoreManager request: {}", req);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
log.info("MDStoreManager response: {}", json);
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)

SparkScalaApplication.scala

@ -2,71 +2,72 @@ package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main interface SparkApplication
* from which all Spark Scala classes should inherit
*
*/
/** This is the main interface SparkApplication
* from which all Spark Scala classes should inherit
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
/** This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
/** Utility to parse the arguments using the
* property json in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
val parser = new ArgumentApplicationParser(
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
)
parser.parseArgument(args)
parser
}
/**
* Here every Spark application runs this method
* where the whole logic of the Spark node is defined
*/
/** Here every Spark application runs this method
* where the whole logic of the Spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
abstract class AbstractScalaApplication(
val propertyPath: String,
val args: Array[String],
log: Logger
) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
var spark: SparkSession = null
def initialize():SparkScalaApplication = {
def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
/** Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession(): SparkSession = {
require(parser != null)
val conf:SparkConf = new SparkConf()
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}
}

ScholixUtils.scala

@ -14,7 +14,6 @@ import scala.io.Source
object ScholixUtils extends Serializable {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
val date = relation.getProperties.asScala
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
.map(p => p.getValue)
if (date.isDefined)
date.get
else
@ -58,78 +62,80 @@ object ScholixUtils extends Serializable {
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
def generateScholixResourceFromResult(r:Result) :ScholixResource = {
def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(
a._1,
b.relatedDataset + relatedDataset,
b.relatedPublication + relatedPublication
)
}
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(
b1.id,
b1.relatedDataset + b2.relatedDataset,
b1.relatedPublication + b2.relatedPublication
)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
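
The aggregators above follow Spark's typed Aggregator contract. A minimal sketch of how such an aggregator is typically applied to a keyed Dataset (the Scholix import path and the input Dataset are assumptions):

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import org.apache.spark.sql.{Dataset, Encoders}

// Keep one "complete" Scholix per target id; `pairs` is a placeholder for a
// Dataset[(String, Scholix)] produced earlier in a Scholexplorer job.
def mergeById(pairs: Dataset[(String, Scholix)]): Dataset[(String, Scholix)] =
  pairs
    .groupByKey(_._1)(Encoders.STRING)
    .agg(ScholixUtils.scholixAggregator.toColumn)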
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
new ScholixEntityId(
d.getDatasourceName,
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
)
}(collection.breakOut)
l
} else List()
@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
c =>
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
new ScholixEntityId(
c.getValue,
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
)
}.toList
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
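
Both generateCompleteScholix overloads derive the identifier from the same convention, md5 of "<sourceId>::<relationName>::<targetId>". A small illustration, assuming DHPUtils.md5 is the hex-digest helper used above (the ids are made up):

import eu.dnetlib.dhp.utils.DHPUtils

// Identifier = md5("<sourceId>::<relationName>::<targetId>")
val sourceId = "50|doi_________::aaaa" // illustrative
val targetId = "60|r3d100010468::bbbb" // illustrative
val id = DHPUtils.md5(s"$sourceId::IsRelatedTo::$targetId")
assert(id.length == 32) // 32-character hex digest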
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
val plist: List[ScholixEntityId] =
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
, "collected", "complete"
)).toList
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
.map(c =>
new ScholixCollectedFrom(
new ScholixEntityId(
c.getDatasourceName,
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
),
"collected",
"complete"
)
)
.toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
@ -244,9 +269,7 @@ object ScholixUtils extends Serializable {
r
}
def scholixFromSource(relation: Relation, source: ScholixResource):Scholix = {
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(source)
s
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
@ -298,12 +321,10 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map {
p =>
new ScholixEntityId(p, null)
.map { p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
@ -313,31 +334,37 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
pidValue.map {
p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
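
A self-contained sketch of the matching rule inside findURLForPID, with plain strings standing in for StructuredProperty (a hypothetical simplification):

// A PID is paired with the first URL whose lowercase form contains the PID value.
def matchUrl(pidValue: String, urls: List[String]): String =
  urls.find(u => u.toLowerCase.contains(pidValue.toLowerCase)).orNull

val urls = List("https://doi.org/10.1371/journal.pone.0085605")
assert(matchUrl("10.1371/journal.pone.0085605", urls) != null) // PID value appears in the URL
assert(matchUrl("10.9999/nomatch", urls) == null)              // unmatched PIDs pair with null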
def resultToSummary(r: Result): ScholixSummary = {
@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
val dt: List[String] = r
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
val subjects: List[SchemeValue] = r.getSubject.asScala
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
.toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
.toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MdStoreClientTest {
@Test
public void testMongoCollection() throws IOException {
final MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore");
final ObjectMapper mapper = new ObjectMapper();
final List<MDStoreInfo> infos = client.mdStoreWithTimestamp("ODF", "store", "cleaned");
infos.forEach(System.out::println);
final String s = mapper.writeValueAsString(infos);
Path fileName = Paths.get("/Users/sandro/mdstore_info.json");
// Writing into the file
Files.write(fileName, s.getBytes(StandardCharsets.UTF_8));
}
}
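
Note that the test above needs a MongoDB on localhost and writes to a developer-specific absolute path. A more portable variant would target a temporary file; a minimal sketch in which only the output handling changes:

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}

// Write the serialized MDStoreInfo list to a JVM-managed temp file instead of
// a hardcoded home directory, so the test can run on any machine.
val s = "[]" // stands for mapper.writeValueAsString(infos)
val fileName: Path = Files.createTempFile("mdstore_info", ".json")
Files.write(fileName, s.getBytes(StandardCharsets.UTF_8))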

View File

@ -33,7 +33,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -56,7 +56,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -80,7 +80,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
@ -100,7 +100,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());

View File

@ -1,100 +0,0 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
class AuthorMergerTest {
private String publicationsBasePath;
private List<List<Author>> authors;
@BeforeEach
public void setUp() throws Exception {
publicationsBasePath = Paths
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
.toFile()
.getAbsolutePath();
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
.stream()
.map(p -> p._2().getAuthor())
.collect(Collectors.toList());
}
@Test
void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));
for (Author author : authors1) {
System.out.println(authorToString(author));
}
}
List<Author> merge = AuthorMerger.merge(authors);
System.out.println("Merge ");
for (Author author : merge) {
System.out.println(authorToString(author));
}
Assertions.assertEquals(7, merge.size());
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
public String authorToString(Author a) {
String print = "Fullname = ";
print += a.getFullname() + " pid = [";
if (a.getPid() != null)
for (StructuredProperty sp : a.getPid()) {
print += sp.toComparableString() + " ";
}
print += "]";
return print;
}
}

View File

@ -44,105 +44,104 @@ class OafMapperUtilsTest {
@Test
void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z "));
assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 "));
assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05"));
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05"));
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST"));
assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700"));
assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000"));
assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015"));
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013"));
assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71"));
assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000"));
assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00"));
assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC"));
assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001"));
assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26"));
assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04"));
assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014"));
assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71"));
assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03"));
assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30"));
assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601"));
assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333"));
}
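
These assertions track an API change in GraphCleaningFunctions: the Optional-returning doCleanDate is wrapped by cleanDate, which, as the assertNotNull checks imply, returns the normalized date directly or null for unparsable input. A minimal sketch of the two call styles under that assumption:

import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions

val viaOptional = GraphCleaningFunctions.doCleanDate("2016 Apr 05").get() // "2016-04-05"
val viaNullable = GraphCleaningFunctions.cleanDate("2016 Apr 05")         // "2016-04-05"
assert(viaOptional == viaNullable)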
@ -185,6 +184,22 @@ class OafMapperUtilsTest {
.getClassid());
}
@Test
void testDelegatedAuthority() throws IOException {
Dataset d1 = read("dataset_2.json", Dataset.class);
Dataset d2 = read("dataset_delegated.json", Dataset.class);
assertEquals(1, d2.getCollectedfrom().size());
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
Result res = OafMapperUtils.mergeResults(d1, d2);
assertEquals(d2, res);
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}

File diff suppressed because one or more lines are too long

View File

@ -1 +1,140 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

View File

@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=7000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@ -99,7 +99,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=10000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
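
The three hunks above only retune spark.sql.shuffle.partitions (2560 and 5000 become 7000, and 10000 for relations), raising reduce-side parallelism for the larger graph tables. The same knob can be set programmatically; a minimal sketch:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("shuffle-partitions-demo").getOrCreate()
// Equivalent of passing --conf spark.sql.shuffle.partitions=7000 in <spark-opts>:
// it controls how many partitions joins and aggregations shuffle into.
spark.conf.set("spark.sql.shuffle.partitions", "7000")
assert(spark.conf.get("spark.sql.shuffle.partitions") == "7000")
spark.close()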

View File

@ -172,6 +172,61 @@ public class PromoteActionPayloadForGraphTableJobTest {
}
}
@Test
void shouldPromoteActionPayload_custom() throws Exception {
Class<? extends Oaf> rowClazz = Publication.class;
Class<? extends Oaf> actionPayloadClazz = Result.class;
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.MERGE_FROM_AND_GET;
// given
Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz);
Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz);
Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase());
// when
PromoteActionPayloadForGraphTableJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputGraphTablePath",
inputGraphTableDir.toString(),
"-graphTableClassName",
rowClazz.getCanonicalName(),
"-inputActionPayloadPath",
inputActionPayloadDir.toString(),
"-actionPayloadClassName",
actionPayloadClazz.getCanonicalName(),
"-outputGraphTablePath",
outputGraphTableDir.toString(),
"-mergeAndGetStrategy",
strategy.name(),
"--shouldGroupById",
"true"
});
// then
assertTrue(Files.exists(outputGraphTableDir));
List<? extends Oaf> actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz)
.collectAsList()
.stream()
.sorted(Comparator.comparingInt(Object::hashCode))
.collect(Collectors.toList());
Publication p = actualOutputRows
.stream()
.map(o -> (Publication) o)
.filter(o -> "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879".equals(o.getId()))
.findFirst()
.get();
assertNotNull(p.getMeasures());
assertTrue(p.getMeasures().size() > 0);
}
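
The new test exercises the --shouldGroupById switch: graph rows sharing an id are folded into a single output row, so the measures carried by the action payload survive the merge. A schematic Scala sketch of that grouping step, with a toy merge function standing in for MERGE_FROM_AND_GET:

// Toy model of the groupById promotion: rows sharing an id are reduced to one.
case class Rec(id: String, measures: List[String])

def mergeFromAndGet(a: Rec, b: Rec): Rec = // stand-in for the real strategy
  Rec(a.id, (a.measures ++ b.measures).distinct)

val rows     = List(Rec("x", Nil), Rec("x", List("influence")), Rec("y", Nil))
val promoted = rows.groupBy(_.id).values.map(_.reduce(mergeFromAndGet)).toList
assert(promoted.exists(r => r.id == "x" && r.measures == List("influence")))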
public static Stream<Arguments> promoteJobTestParams() {
return Stream
.of(

View File

@ -17,4 +17,5 @@
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"1.64385446761e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"18.9590813696","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"6.00577981643e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"author":null,"resulttype":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":null}

View File

@@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [
  AvoidInfix,
  ExpandImportSelectors,
  RedundantBraces,
  RedundantParens,
  SortImports,
  SortModifiers,
  PreferCurlyFors
]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>

View File

@@ -13,19 +13,24 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
public static final String DOI = "doi";
public static final String DOI_CLASSNAME = "Digital Object Identifier";
public static final String DEFAULT_DELIMITER = ",";
public static final String DEFAULT_FOS_DELIMITER = "\t";
public static final String UPDATE_DATA_INFO_TYPE = "update";
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
public static final String UPDATE_KEY_USAGE_COUNTS = "count";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
@@ -55,13 +60,13 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static StructuredProperty getSubject(String sbj, String classid, String classname,
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj.equals(NULL))
if (sbj == null || sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
Subject s = new Subject();
s.setValue(sbj);
s
.setQualifier(
OafMapperUtils
.qualifier(
@@ -69,7 +74,7 @@ public class Constants {
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
s
.setDataInfo(
OafMapperUtils
.dataInfo(
@@ -85,7 +90,7 @@ public class Constants {
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
return s;
}
}
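A minimal sketch of the reworked helper as seen from a caller; the wrapper class is invented for illustration, while Constants and the FOS constant values are the ones defined above:

import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.schema.oaf.Subject;

public class GetSubjectSketch {
	public static void main(String[] args) {
		// builds a Subject qualified with the FOS classification
		Subject s = Constants
			.getSubject(
				"animal diseases", "FOS",
				"Fields of Science and Technology classification", "subject:fos");
		System.out.println(s.getValue() + " / " + s.getQualifier().getClassid());
		// the added null guard: both null and the "NULL" sentinel map to null
		System.out.println(Constants.getSubject(null, "FOS", "", "subject:fos"));
	}
}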

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_FOS_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -9,8 +9,6 @@ import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
@@ -49,7 +47,7 @@ public class GetFOSSparkJob implements Serializable {
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
.orElse(DEFAULT_FOS_DELIMITER);
SparkConf sconf = new SparkConf();
runWithSparkSession(

View File

@@ -33,6 +33,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
@@ -95,11 +96,39 @@ public class PrepareBipFinder implements Serializable {
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
Instance inst = new Instance();
inst.setMeasures(getMeasure(v));
inst
.setPid(
Arrays
.asList(
OafMapperUtils
.structuredProperty(
cleanedPid,
OafMapperUtils
.qualifier(
DOI, DOI_CLASSNAME,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES),
null)));
r.setInstance(Arrays.asList(inst));
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -21,8 +21,11 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareFOSSparkJob implements Serializable {
@@ -71,16 +74,30 @@ public class PrepareFOSSparkJob {
Result r = new Result();
FOSDataModel first = it.next();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -21,8 +21,11 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable {
@@ -71,13 +74,26 @@ public class PrepareSDGSparkJob {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>();
List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it
.forEachRemaining(
s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()

View File

@@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@@ -67,7 +68,19 @@ public class SparkSaveUnresolved implements Serializable {
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
it.forEachRemaining(r -> {
if (r.getInstance() != null) {
ret.setInstance(r.getInstance());
}
if (r.getSubject() != null) {
if (ret.getSubject() != null)
ret.getSubject().addAll(r.getSubject());
else
ret.setSubject(r.getSubject());
}
// ret.mergeFrom(r)
});
return ret;
}, Encoders.bean(Result.class))
.write()
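The explicit merge above replaces the previous ret.mergeFrom(r) call; as a hedged sketch, the same logic factored into a plain helper (name invented) makes the semantics visible: the instance list of the last record wins, while subjects accumulate:

import eu.dnetlib.dhp.schema.oaf.Result;

// hypothetical helper mirroring the mapGroups body above
static Result mergeUnresolved(Result acc, Result r) {
	if (r.getInstance() != null) {
		acc.setInstance(r.getInstance()); // replaced, not appended
	}
	if (r.getSubject() != null) {
		if (acc.getSubject() != null)
			acc.getSubject().addAll(r.getSubject());
		else
			acc.setSubject(r.getSubject());
	}
	return acc;
}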

View File

@@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.read()
.textFile(inputPath + "/*")
.map(
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
Encoders.bean(COCI.class))
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
@@ -98,26 +103,30 @@
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
private static List<Relation> createRelation(COCI value, boolean duplicate) {
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
String citing = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
final String cited = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
relationList
.addAll(
getRelations(
citing,
cited));
if (!citing.equals(cited)) {
relationList
.add(
getRelation(
citing,
cited, ModelConstants.CITES));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
if (duplicate && value.getCiting().endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(
CleaningFunctions
.normalizePidValue(
"doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
relationList.add(getRelation(citing, cited, ModelConstants.CITES));
}
}
return relationList;
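A hedged sketch of the new contract, as it could be exercised from a test in the same class (the method is private, so this is a fragment rather than standalone code; the DOI values are invented): createRelation now receives a parsed COCI bean instead of a raw CSV line, and the citing/cited comparison guards against self-citations:

COCI coci = new COCI();
coci.setCiting("10.1000/a"); // hypothetical citing DOI
coci.setCited("10.1000/b"); // hypothetical cited DOI
// distinct DOIs: one CITES relation
List<Relation> rels = createRelation(coci, false);

coci.setCited("10.1000/a");
// citing equals cited: the guard yields an empty list
List<Relation> none = createRelation(coci, false);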

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class ReadCOCI implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
ReadCOCI.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile: {}", String.join(";", inputFile));
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
SparkConf sconf = new SparkConf();
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
doRead(
spark,
workingPath,
inputFile,
outputPath,
delimiter);
});
}
private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
String outputPath,
String delimiter) throws IOException {
for (String inputFile : inputFiles) {
String sourcePath = workingPath + "/" + inputFile + ".gz";
Dataset<Row> cociData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quote", "\"")
.load(sourcePath)
.repartition(100);
cociData.map((MapFunction<Row, COCI>) row -> {
COCI coci = new COCI();
coci.setOci(row.getString(0));
coci.setCiting(row.getString(1));
coci.setCited(row.getString(2));
return coci;
}, Encoders.bean(COCI.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + inputFile);
}
}
}
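Each input file ends up as a separate gzipped JSON dataset; a sketch of how a downstream job could decode it back into COCI beans, following the same pattern CreateActionSetSparkJob uses above (the wrapper class name is invented and the path is a placeholder):

import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;

public class ReadBackSketch {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	// decodes one per-file output written by ReadCOCI.doRead
	static Dataset<COCI> readBack(SparkSession spark, String path) {
		return spark
			.read()
			.textFile(path) // e.g. outputPath + inputFile
			.map((MapFunction<String, COCI>) v -> OBJECT_MAPPER.readValue(v, COCI.class), Encoders.bean(COCI.class));
	}
}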

View File

@@ -0,0 +1,39 @@
package eu.dnetlib.dhp.actionmanager.opencitations.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class COCI implements Serializable {
private String oci;
private String citing;
private String cited;
public String getOci() {
return oci;
}
public void setOci(String oci) {
this.oci = oci;
}
public String getCiting() {
return citing;
}
public void setCiting(String citing) {
this.citing = citing;
}
public String getCited() {
return cited;
}
public void setCited(String cited) {
this.cited = cited;
}
}

View File

@@ -266,11 +266,15 @@ public class PrepareProgramme {
String code = csvProgramme.getCode();
if (!code.endsWith(".") && !code.contains("Euratom")
&& !code.equals("H2020-EC"))
&& !code.equals("H2020-EC") && !code.equals("H2020") &&
!code.equals("H2020-Topics"))
code += ".";
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
if (map.containsKey(code)) {
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
} else
log.warn("No entry in map for code {}", code);
return csvProgramme;
}).collect();

View File

@@ -3,12 +3,23 @@ package eu.dnetlib.dhp.actionmanager.project;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.*;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@@ -19,6 +30,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import scala.Tuple2;
@@ -54,6 +66,9 @@ public class PrepareProjects {
final String projectPath = parser.get("projectPath");
log.info("projectPath: {}", projectPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@@ -76,7 +91,7 @@
}
private static void exec(SparkSession spark, String projectPath, String dbProjectPath, String outputPath) {
Dataset<CSVProject> project = readPath(spark, projectPath, CSVProject.class);
Dataset<Project> project = readPath(spark, projectPath, Project.class);
Dataset<ProjectSubset> dbProjects = readPath(spark, dbProjectPath, ProjectSubset.class);
dbProjects
@@ -90,14 +105,14 @@
}
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
private static FlatMapFunction<Tuple2<ProjectSubset, Project>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) {
if (Optional.ofNullable(value._2()).isPresent()) {
Project project = value._2();
String[] programme = csvProject.get().getProgramme().split(";");
String topic = csvProject.get().getTopics();
String[] programme = project.getLegalBasis().split(";");
String topic = project.getTopics();
Arrays
.stream(programme)
@@ -106,7 +121,7 @@
proj.setTopics(topic);
proj.setProgramme(p);
proj.setId(csvProject.get().getId());
proj.setId(project.getId());
csvProjectList.add(proj);
});
}
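In short, every matched Project is fanned out into one CSVProject per entry of its legalBasis field; a standalone sketch of that fan-out (the wrapper class is invented; the bean setters are assumed to match the model classes shown below):

import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;

public class FanOutSketch {
	// mirrors the body of getTuple2CSVProjectFlatMapFunction
	static List<CSVProject> fanOut(Project project) {
		List<CSVProject> out = new ArrayList<>();
		for (String p : project.getLegalBasis().split(";")) {
			CSVProject proj = new CSVProject();
			proj.setTopics(project.getTopics());
			proj.setProgramme(p);
			proj.setId(project.getId());
			out.add(proj);
		}
		return out;
	}
}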

View File

@@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
@@ -110,7 +111,7 @@ public class SparkAtomicActionJob {
Dataset<CSVProject> project = readPath(spark, projectPatH, CSVProject.class);
Dataset<CSVProgramme> programme = readPath(spark, programmePath, CSVProgramme.class);
Dataset<EXCELTopic> topic = readPath(spark, topicPath, EXCELTopic.class);
Dataset<JsonTopic> topic = readPath(spark, topicPath, JsonTopic.class);
Dataset<Project> aaproject = project
.joinWith(programme, project.col("programme").equalTo(programme.col("code")), "left")
@@ -124,9 +125,7 @@
Project pp = new Project();
pp
.setId(
createOpenaireId(
ModelSupport.entityIdPrefix.get("project"),
"corda__h2020", csvProject.getId()));
csvProject.getId());
pp.setH2020topiccode(csvProject.getTopics());
H2020Programme pm = new H2020Programme();
H2020Classification h2020classification = new H2020Classification();
@@ -144,10 +143,15 @@
.filter(Objects::nonNull);
aaproject
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
.joinWith(topic, aaproject.col("id").equalTo(topic.col("projectID")), "left")
.map((MapFunction<Tuple2<Project, JsonTopic>, Project>) p -> {
Optional<JsonTopic> op = Optional.ofNullable(p._2());
Project rp = p._1();
rp
.setId(
createOpenaireId(
ModelSupport.entityIdPrefix.get("project"),
"corda__h2020", rp.getId()));
op.ifPresent(excelTopic -> rp.setH2020topicdescription(excelTopic.getTitle()));
return rp;
}, Encoders.bean(Project.class))

View File

@@ -22,6 +22,7 @@ import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
/**
* Reads a generic excel file and maps it into classes that mirror its schema
*/
@Deprecated
public class EXCELParser {
public <R> List<R> parse(InputStream file, String classForName, String sheetName)

View File

@@ -0,0 +1,101 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ExtractFromZip implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ExtractFromZip.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/extract_fromzip_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
doExtract(inputPath, outputPath, fs);
}
private static void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
throws IOException {
final Path path = new Path(inputFile);
FSDataInputStream projectZip = fileSystem.open(path);
try (ZipInputStream zis = new ZipInputStream(projectZip)) {
ZipEntry entry = null;
while ((entry = zis.getNextEntry()) != null) {
if (!entry.isDirectory()) {
String fileName = entry.getName();
byte[] buffer = new byte[1024];
int count;
try (
FSDataOutputStream out = fileSystem
.create(new Path(workingPath + fileName))) {
while ((count = zis.read(buffer, 0, buffer.length)) != -1)
out.write(buffer, 0, count);
}
}
}
}
}
}
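A hedged invocation sketch; the flag names are assumed to match the json parameter definitions referenced above, and the paths are placeholders:

// hypothetical command-line style invocation
ExtractFromZip.main(new String[] {
	"--inputPath", "/tmp/project.zip",
	"--outputPath", "/tmp/extracted/",
	"--hdfsNameNode", "hdfs://nameservice1"
});

Note that doExtract concatenates workingPath and the zip entry name directly, so the output path should end with a slash.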

View File

@@ -6,7 +6,9 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
@@ -40,8 +42,11 @@ public class ReadCSV {
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataInputStream inputStream = fileSystem.open(new Path(fileURL));
BufferedReader reader = new BufferedReader(
new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)));
new InputStreamReader(inputStream));
GetCSV.getCsv(fileSystem, reader, hdfsPath, classForName, del);

View File

@@ -0,0 +1,90 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ReadProjects implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadProjects.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/read_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
readProjects(inputPath, outputPath, fs);
}
public static void readProjects(String inputFile, String workingPath, FileSystem fs) throws IOException {
Path hdfsReadPath = new Path(inputFile);
FSDataInputStream inputStream = fs.open(hdfsReadPath);
ArrayList<Project> projects = OBJECT_MAPPER
.readValue(
IOUtils.toString(inputStream, StandardCharsets.UTF_8),
new TypeReference<List<Project>>() {
});
Path hdfsWritePath = new Path(workingPath);
if (fs.exists(hdfsWritePath)) {
fs.delete(hdfsWritePath, false);
}
FSDataOutputStream fos = fs.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
for (Project p : projects) {
writer.write(OBJECT_MAPPER.writeValueAsString(p));
writer.newLine();
}
}
}
}

View File

@@ -0,0 +1,92 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class ReadTopics implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadTopics.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareProjects.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/read_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fs = FileSystem.get(conf);
readTopics(inputPath, outputPath, fs);
}
public static void readTopics(String inputFile, String workingPath, FileSystem fs) throws IOException {
Path hdfsReadPath = new Path(inputFile);
FSDataInputStream inputStream = fs.open(hdfsReadPath);
ArrayList<JsonTopic> topics = OBJECT_MAPPER
.readValue(
IOUtils.toString(inputStream, StandardCharsets.UTF_8),
new TypeReference<List<JsonTopic>>() {
});
Path hdfsWritePath = new Path(workingPath);
if (fs.exists(hdfsWritePath)) {
fs.delete(hdfsWritePath, false);
}
FSDataOutputStream fos = fs.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
for (JsonTopic p : topics) {
writer.write(OBJECT_MAPPER.writeValueAsString(p));
writer.newLine();
}
}
}
}

View File

@@ -13,7 +13,7 @@ public class CSVProject implements Serializable {
@CsvBindByName(column = "id")
private String id;
@CsvBindByName(column = "programme")
@CsvBindByName(column = "legalBasis")
private String programme;
@CsvBindByName(column = "topics")

View File

@@ -6,6 +6,7 @@ import java.io.Serializable;
/**
* the model class for the topic excel file
*/
@Deprecated
public class EXCELTopic implements Serializable {
private String rcn;
private String language;
@@ -17,9 +18,27 @@ public class EXCELTopic implements Serializable {
private String title;
private String shortTitle;
private String objective;
private String subjects;
private String keywords;
private String legalBasis;
private String call;
private String id;
private String contentUpdateDate;
public String getContentUpdateDate() {
return contentUpdateDate;
}
public void setContentUpdateDate(String contentUpdateDate) {
this.contentUpdateDate = contentUpdateDate;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getRcn() {
return rcn;
@@ -101,12 +120,12 @@ public class EXCELTopic implements Serializable {
this.objective = objective;
}
public String getSubjects() {
return subjects;
public String getKeywords() {
return keywords;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getLegalBasis() {

View File

@@ -0,0 +1,38 @@
package eu.dnetlib.dhp.actionmanager.project.utils.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 28/02/23
*/
public class JsonTopic implements Serializable {
private String projectID;
private String title;
private String topic;
public String getProjectID() {
return projectID;
}
public void setProjectID(String projectID) {
this.projectID = projectID;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getTopic() {
return topic;
}
public void setTopic(String topic) {
this.topic = topic;
}
}

View File

@@ -0,0 +1,191 @@
package eu.dnetlib.dhp.actionmanager.project.utils.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 24/02/23
*/
public class Project implements Serializable {
private String acronym;
private String contentUpdateDate;
private String ecMaxContribution;
private String ecSignatureDate;
private String endDate;
private String frameworkProgramme;
private String fundingScheme;
private String grantDoi;
private String id;
private String legalBasis;
private String masterCall;
private String nature;
private String objective;
private String rcn;
private String startDate;
private String status;
private String subCall;
private String title;
private String topics;
private String totalCost;
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getContentUpdateDate() {
return contentUpdateDate;
}
public void setContentUpdateDate(String contentUpdateDate) {
this.contentUpdateDate = contentUpdateDate;
}
public String getEcMaxContribution() {
return ecMaxContribution;
}
public void setEcMaxContribution(String ecMaxContribution) {
this.ecMaxContribution = ecMaxContribution;
}
public String getEcSignatureDate() {
return ecSignatureDate;
}
public void setEcSignatureDate(String ecSignatureDate) {
this.ecSignatureDate = ecSignatureDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getFundingScheme() {
return fundingScheme;
}
public void setFundingScheme(String fundingScheme) {
this.fundingScheme = fundingScheme;
}
public String getGrantDoi() {
return grantDoi;
}
public void setGrantDoi(String grantDoi) {
this.grantDoi = grantDoi;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getLegalBasis() {
return legalBasis;
}
public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}
public String getMasterCall() {
return masterCall;
}
public void setMasterCall(String masterCall) {
this.masterCall = masterCall;
}
public String getNature() {
return nature;
}
public void setNature(String nature) {
this.nature = nature;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getRcn() {
return rcn;
}
public void setRcn(String rcn) {
this.rcn = rcn;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getSubCall() {
return subCall;
}
public void setSubCall(String subCall) {
this.subCall = subCall;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getTopics() {
return topics;
}
public void setTopics(String topics) {
this.topics = topics;
}
public String getTotalCost() {
return totalCost;
}
public void setTotalCost(String totalCost) {
this.totalCost = totalCost;
}
}

View File

@@ -29,8 +29,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -38,16 +37,17 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@@ -60,10 +60,8 @@ public class GenerateRorActionSetJob {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ROR_NS_PREFIX = "ror_________";
private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
"10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");
Constants.ROR_OPENAIRE_ID, Constants.ROR_DATASOURCE_NAME);
private static final DataInfo ROR_DATA_INFO = dataInfo(
false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");
@@ -112,25 +110,22 @@
final String outputPath) throws IOException {
readInputPath(spark, inputPath)
.map(
(MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
Encoders.bean(Organization.class))
.toJavaRDD()
.map(o -> new AtomicAction<>(Organization.class, o))
.map(GenerateRorActionSetJob::convertRorOrg)
.flatMap(List::iterator)
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
protected static Organization convertRorOrg(final RorOrganization r) {
protected static List<AtomicAction<? extends Oaf>> convertRorOrg(final RorOrganization r) {
final Date now = new Date();
final Organization o = new Organization();
o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
o.setId(calculateOpenaireId(r.getId()));
o.setOriginalId(Arrays.asList(String.format("%s::%s", Constants.ROR_NS_PREFIX, r.getId())));
o.setCollectedfrom(ROR_COLLECTED_FROM);
o.setPid(pids(r));
o.setDateofcollection(now.toString());
@@ -166,7 +161,15 @@
o.setDataInfo(ROR_DATA_INFO);
o.setLastupdatetimestamp(now.getTime());
return o;
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
res.add(new AtomicAction<>(Organization.class, o));
return res;
}
private static String calculateOpenaireId(final String rorId) {
return String.format("20|%s::%s", Constants.ROR_NS_PREFIX, DHPUtils.md5(rorId));
}
private static List<StructuredProperty> pids(final RorOrganization r) {
@@ -202,14 +205,14 @@
.collect(Collectors.toList());
}
private static Dataset<RorOrganization> readInputPath(
private static JavaRDD<RorOrganization> readInputPath(
final SparkSession spark,
final String path) throws IOException {
try (final FileSystem fileSystem = FileSystem.get(new Configuration());
final InputStream is = fileSystem.open(new Path(path))) {
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)).toJavaRDD();
}
}

View File

@@ -0,0 +1,186 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* created the Atomic Action for each type of results
*/
public class SparkAtomicActionUsageJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionUsageJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final String dbname = parser.get("usagestatsdb");
final String workingPath = parser.get("workingPath");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareData(dbname, spark, workingPath + "/usageDb", "usage_stats", "result_id");
prepareData(dbname, spark, workingPath + "/projectDb", "project_stats", "id");
prepareData(dbname, spark, workingPath + "/datasourceDb", "datasource_stats", "repositor_id");
writeActionSet(spark, workingPath, outputPath);
});
}
private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName,
String attribute_name) {
spark
.sql(
String
.format(
"select %s as id, sum(downloads) as downloads, sum(views) as views " +
"from %s.%s group by %s",
attribute_name, dbname, tableName, attribute_name))
.as(Encoders.bean(UsageStatsModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {
getFinalIndicatorsResult(spark, inputPath + "/usageDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.union(
getFinalIndicatorsProject(spark, inputPath + "/projectDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.union(
getFinalIndicatorsDatasource(spark, inputPath + "/datasourceDb")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return r;
}, Encoders.bean(Result.class));
}
private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Project>) usm -> {
Project p = new Project();
p.setId("40|" + usm.getId());
p.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return p;
}, Encoders.bean(Project.class));
}
private static Dataset<Datasource> getFinalIndicatorsDatasource(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsModel.class)
.map((MapFunction<UsageStatsModel, Datasource>) usm -> {
Datasource d = new Datasource();
d.setId("10|" + usm.getId());
d.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
return d;
}, Encoders.bean(Datasource.class));
}
private static List<Measure> getMeasure(Long downloads, Long views) {
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"");
return Arrays
.asList(
OafMapperUtils
.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}
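The three branches above differ only in the entity prefix prepended to the id; a tiny sketch of the bean the aggregation query is decoded into (all values invented):

UsageStatsModel usm = new UsageStatsModel();
usm.setId("openaire____::abc123"); // hypothetical id coming from the stats db
usm.setDownloads(10L);
usm.setViews(25L);
// getFinalIndicatorsResult then prefixes "50|", ...Project "40|", ...Datasource "10|"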

View File

@@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import java.io.Serializable;
public class UsageStatsModel implements Serializable {
private String id;
private Long downloads;
private Long views;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Long getDownloads() {
return downloads;
}
public void setDownloads(Long downloads) {
this.downloads = downloads;
}
public Long getViews() {
return views;
}
public void setViews(Long views) {
this.views = views;
}
}

View File

@@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
@@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob {
return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin(fileSystem);
case fileGzip:
return new FileGZipCollectorPlugin(fileSystem);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))

View File

@@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml;
oai, other, rest_json2xml, file, fileGzip;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb

View File

@@ -0,0 +1,80 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
public static final String SPLIT_ON_ELEMENT = "splitOnElement";
private final FileSystem fileSystem;
public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
// get path to file
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
log.info("baseUrl: {}", filePath);
// check that path to file exists
try {
if (!fileSystem.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
} catch (IOException e) {
throw new CollectorException(e);
}
// get split element
final String splitOnElement = Optional
.ofNullable(api.getParams().get(SPLIT_ON_ELEMENT))
.orElseThrow(
() -> new CollectorException(String
.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
log.info("splitOnElement: {}", splitOnElement);
final BufferedInputStream bis = getBufferedInputStream(filePath);
Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis);
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
false);
}
abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;
public FileSystem getFileSystem() {
return fileSystem;
}
}

View File

@@ -0,0 +1,33 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
public FileCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
return new BufferedInputStream(fs.open(filePath));
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}

View File

@@ -0,0 +1,35 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);
public FileGZipCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
GZIPInputStream stream = new GZIPInputStream(fs.open(filePath));
return new BufferedInputStream(stream);
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}
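An end-to-end sketch of the new file plugins; the HDFS path is a placeholder, and it is assumed here that ApiDescriptor requires its params map to be initialised by the caller and that AggregatorReport offers a no-argument constructor:

import java.util.HashMap;
import java.util.stream.Stream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.file.AbstractSplittedRecordPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;

public class FilePluginSketch {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		ApiDescriptor api = new ApiDescriptor();
		api.setBaseUrl("hdfs:///data/dump.xml.gz"); // placeholder location
		api.setParams(new HashMap<>()); // assumption: params must be initialised
		api.getParams().put(AbstractSplittedRecordPlugin.SPLIT_ON_ELEMENT, "record");
		// each emitted String is one XML record split out of the gzipped dump
		try (Stream<String> records = new FileGZipCollectorPlugin(fs).collect(api, new AggregatorReport())) {
			records.limit(2).forEach(System.out::println);
		}
	}
}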

View File

@@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.XmlCleaner;
import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpConnector2;

View File

@@ -30,7 +30,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.JsonUtils;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

View File

@@ -0,0 +1,177 @@
package eu.dnetlib.dhp.collection.plugin.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Iterator;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class XMLIterator implements Iterator<String> {
private static final Log log = LogFactory.getLog(XMLIterator.class);
private final ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(XMLInputFactory::newInstance);
private final ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(XMLOutputFactory::newInstance);
private final ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(XMLEventFactory::newInstance);
public static final String UTF_8 = "UTF-8";
final XMLEventReader parser;
private XMLEvent current = null;
private String element;
private InputStream inputStream;
public XMLIterator(final String element, final InputStream inputStream) {
super();
this.element = element;
this.inputStream = inputStream;
this.parser = getParser();
try {
this.current = findElement(parser);
} catch (XMLStreamException e) {
log.warn("cannot init parser position. No element found: " + element);
current = null;
}
}
@Override
public boolean hasNext() {
return current != null;
}
@Override
public String next() {
String result = null;
try {
result = copy(parser);
current = findElement(parser);
return result;
} catch (XMLStreamException e) {
throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@SuppressWarnings("finally")
private String copy(final XMLEventReader parser) throws XMLStreamException {
final StringWriter result = new StringWriter();
try {
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
final StartElement start = current.asStartElement();
final StartElement newRecord = eventFactory
.get()
.createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
// new root record
writer.add(newRecord);
// copy the rest as it is
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking.
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
writer.add(event);
break;
}
writer.add(event);
}
writer.close();
} finally {
return result.toString();
}
}
/**
* Looks for the next occurrence of the splitter element.
*
* @param parser the event reader to advance
* @return the start element event opening the next record, or null when the stream is exhausted
* @throws XMLStreamException if the underlying stream cannot be parsed
*/
private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {
XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
return peek;
}
}
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
return event;
}
}
}
return null;
}
private XMLEventReader getParser() {
try {
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
}
private Reader sanitize(final InputStream in) {
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
return new InputStreamReader(in, charsetDecoder);
}
}
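A minimal usage sketch (the file name and splitter element below are hypothetical): the iterator yields one serialized record per occurrence of the splitter element.

import java.io.FileInputStream

// assume records.xml contains a sequence of <record> elements
val records = new XMLIterator("record", new FileInputStream("records.xml"))
while (records.hasNext) {
  val xml: String = records.next() // one full <record>...</record> fragment
  println(xml.length)
}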

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import java.util.HashMap;
import java.util.HashSet;

View File

@ -17,6 +17,9 @@ public class PMArticle implements Serializable {
* the Pubmed Identifier
*/
private String pmid;
private String pmcId;
/**
* the DOI
*/
@ -122,7 +125,7 @@ public class PMArticle implements Serializable {
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
*
* @return the pubmed Journal Extracted
@ -140,10 +143,11 @@ public class PMArticle implements Serializable {
}
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
*
* @return the extracted pubmed Title
*/
@ -250,4 +254,13 @@ public class PMArticle implements Serializable {
public List<PMGrant> getGrants() {
return grants;
}
public String getPmcId() {
return pmcId;
}
public PMArticle setPmcId(String pmcId) {
this.pmcId = pmcId;
return this;
}
}
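For illustration, a minimal sketch of the new fluent setter, assuming PMArticle has a no-argument constructor (the PMC identifier below is made up):

val article = new PMArticle()
article.setPmcId("PMC7654321") // the setter returns this, so calls can be chained
println(article.getPmcId)      // prints PMC7654321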

View File

@ -86,7 +86,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from bip finder!</name>
<name>Produces the unresolved from BIP! Finder</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
@ -135,7 +135,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from FOS!</name>
<name>Produces the unresolved from FOS</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
@ -185,7 +185,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from FOS!</name>
<name>Produces the unresolved from FOS</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>

View File

@ -0,0 +1,37 @@
[
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "d",
"paramLongName": "delimiter",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the hdfs name node",
"paramRequired": true
},
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the hdfs name node",
"paramRequired": true
}
]

View File

@ -26,6 +26,7 @@
<switch>
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
<case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
@ -60,6 +61,32 @@
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="read"/>
<error to="Kill"/>
</action>
<action name="read">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the AS for OC</name>
<class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
</spark>
<ok to="create_actionset"/>
<error to="Kill"/>
</action>
@ -81,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@ -0,0 +1,23 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the path where the projects are stored ",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path for the extracted folder",
"paramRequired": true
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the hdfs namenode",
"paramRequired": true
}
]

View File

@ -0,0 +1,3 @@
#!/bin/bash
# Usage: download.sh <url> <hdfs-target-path>
# Remove any previous copy, then stream the remote file straight into HDFS.
hdfs dfs -rm -f "$2"
curl -LSs "$1" | hdfs dfs -put - "$2"

View File

@ -1,27 +1,9 @@
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>projectFileURL</name>
<description>the url where to get the projects file</description>
</property>
<property>
<name>programmeFileURL</name>
<description>the url where to get the programme file</description>
</property>
<property>
<name>topicFileURL</name>
<description>the url where to get the topic file</description>
</property>
<property>
<name>outputPath</name>
<description>path where to store the action set</description>
</property>
<property>
<name>sheetName</name>
<description>the name of the sheet to read</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
@ -35,40 +17,103 @@
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="fork_get_info"/>
<ok to="fork_download_info"/>
<error to="Kill"/>
</action>
<fork name="fork_get_info">
<fork name="fork_download_info">
<path start="fork_get_projects"/>
<path start="get_programme_file"/>
<path start="get_topic_file"/>
<path start="download_programme_file"/>
</fork>
<fork name="fork_get_projects">
<path start="get_project_file"/>
<path start="read_projects"/>
<path start="download_projects"/>
<path start="read_projects_from_db"/>
</fork>
<action name="get_project_file">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${projectFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject</arg>
</java>
<ok to="wait_projects"/>
<action name="download_projects">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${downloadH2020Projects}</argument>
<argument>${projectPath}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract_projects"/>
<error to="Kill"/>
</action>
<action name="get_programme_file">
<action name="extract_projects">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ExtractFromZip</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${projectPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/</arg>
</java>
<ok to="read_from_folder"/>
<error to="Kill"/>
</action>
<fork name="read_from_folder">
<path start="read_projects"/>
<path start="read_topic_file"/>
</fork>
<action name="read_projects">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadProjects</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${workingDir}/json/project.json</arg>
<arg>--outputPath</arg><arg>${workingDir}/projects</arg>
</java>
<ok to="wait_read_from_folder"/>
<error to="Kill"/>
</action>
<action name="download_programme_file">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${downloadH2020Programme}</argument>
<argument>${programmePath}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract_programme"/>
<error to="Kill"/>
</action>
<action name="extract_programme">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ExtractFromZip</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputPath</arg><arg>${programmePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/downloadedProgramme/</arg>
</java>
<ok to="read_programme"/>
<error to="Kill"/>
</action>
<action name="read_programme">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${programmeFileURL}</arg>
<arg>--fileURL</arg><arg>${workingDir}/downloadedProgramme/csv/programme.csv</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme</arg>
</java>
@ -76,20 +121,18 @@
<error to="Kill"/>
</action>
<action name="get_topic_file">
<action name="read_topic_file">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadExcel</main-class>
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadTopics</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${topicFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
<arg>--sheetName</arg><arg>${sheetName}</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic</arg>
<arg>--inputPath</arg><arg>${workingDir}/json/topics.json</arg>
<arg>--outputPath</arg><arg>${workingDir}/topic</arg>
</java>
<ok to="wait"/>
<ok to="wait_read_from_folder"/>
<error to="Kill"/>
</action>
<action name="read_projects">
<action name="read_projects_from_db">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.project.ReadProjectsFromDB</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/dbProjects</arg>
@ -123,9 +166,11 @@
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
</spark>
<ok to="wait"/>
<!-- <ok to="End"/>-->
<error to="Kill"/>
</action>
<join name="wait_read_from_folder" to="wait_projects"/>
<join name="wait" to="create_updates"/>
<join name="wait_projects" to="prepare_project"/>
@ -153,6 +198,7 @@
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
</spark>
<ok to="wait"/>
<!-- <ok to="End"/>-->
<error to="Kill"/>
</action>

View File

@ -0,0 +1,23 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the path where the projects are stored ",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path for the extracted folder",
"paramRequired": true
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the hdfs namenode",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "hmu",
"paramLongName": "hive_metastore_uris",
"paramDescription": "the URI for the hive metastore",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "sdb",
"paramLongName": "usagestatsdb",
"paramDescription": "the name of the db to be used",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the workingPath where to save the content of the usage_stats table",
"paramRequired": true
}
]

View File

@ -1,25 +1,12 @@
<workflow-app name="sub_dump_community_products" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="UsageStatsCounts" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
<description>the path where to store the actionset</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
<name>usagestatsdb</name>
<description>the name of the db to be used</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -76,50 +63,19 @@
</configuration>
</global>
<start to="common_action_community_funder"/>
<start to="atomicactions"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="common_action_community_funder">
<sub-workflow>
<app-path>${wf:appPath()}/dump_common
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>selectedResults</name>
<value>${sourcePath}</value>
</property>
<property>
<name>communityMapPath</name>
<value>${workingDir}/communityMap</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}</value>
</property>
</configuration>
</sub-workflow>
<ok to="splitForCommunities" />
<error to="Kill" />
</action>
<action name="splitForCommunities">
<action name="atomicactions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Split dumped result for community</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<name>Produces the atomic action with the usage stats count for results</name>
<class>eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
@ -130,16 +86,14 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
<arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,63 @@
from urllib.request import urlopen
import json


def retrieve_datacite_clients(base_url):
    """Map each DataCite client id to its re3data identifier (the DOI suffix)."""
    datacite_clients = {}
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            data = json.loads(response.read())
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    datacite_clients[item['id'].lower()] = \
                        item['attributes']['re3data'].lower().replace("https://doi.org/", "")
                # follow the API pagination cursor; the last page may omit it
                base_url = data['links'].get('next')
            else:
                base_url = None
    return datacite_clients


def retrieve_r3data(start_url):
    """Map each re3data id to its OpenAIRE datasource id and official name."""
    r3data_clients = {}
    page_number = 1
    base_url = start_url
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            data = json.loads(response.read())
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    r3data_clients[item['id'].lower()] = dict(
                        openaire_id="re3data_____::" + item['attributes']['re3dataId'].lower(),
                        official_name=item['attributes']['repositoryName']
                    )
                page_number += 1
                base_url = f"{start_url}&page[number]={page_number}"
            else:
                base_url = None
    return r3data_clients


base_url = "https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"
dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")

# join the two maps: keep only clients whose re3data id is known to re3data
result = {}
for item in dc:
    res = dc[item].lower()
    if res not in r3:
        print(f"missing {res} for {item} in dictionary")
    else:
        result[item.upper()] = dict(
            openaire_id=r3[res]["openaire_id"],
            datacite_name=r3[res]["official_name"],
            official_name=r3[res]["official_name"]
        )

with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=1)
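# For reference, the shape of one entry in the generated hostedBy_map.json;
# the client id, re3data id and names below are illustrative, not real:
#
# "EXAMPLE.CLIENT": {
#  "openaire_id": "re3data_____::r3dxxxxxxxxxx",
#  "datacite_name": "Example Repository",
#  "official_name": "Example Repository"
# }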

View File

@ -7,16 +7,14 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
object CollectionUtils {
/**
* This method in pipeline to the transformation phase,
* generates relations in both verse, typically it should be a phase of flatMap
*
* @param i input OAF
* @return
* If the input OAF is an entity -> List(i)
* If the input OAF is a relation -> List(relation, inverseRelation)
*
*/
/** This method, applied in pipeline to the transformation phase,
  * generates relations in both directions; typically it should be a flatMap phase
  *
  * @param i input OAF
  * @return
  *   If the input OAF is an entity -> List(i)
  *   If the input OAF is a relation -> List(relation, inverseRelation)
  */
def fixRelations(i: Oaf): List[Oaf] = {
if (i.isInstanceOf[OafEntity])

View File

@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder
abstract class AbstractRestClient extends Iterator[String] {
var buffer: List[String] = List()
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
var complete: Boolean = false
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
def doHTTPGETRequest(url: String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
buffer.nonEmpty && current_index < buffer.size
}
override def next(): String = {
val next_item: String = buffer(current_index)
current_index = current_index + 1
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
next_item
}
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
val timeout = 60; // seconds
val config = RequestConfig.custom()
val timeout = 600; // seconds
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
} else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@ -87,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
}
getBufferData()
}
}

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.datacite
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, JValue}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
override def extractInfo(input: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
current_index = 0
}
def get_url():String ={
val to = if (until> 0) s"$until" else "*"
def get_url(): String = {
val to = if (until > 0) s"$until" else "*"
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
}
override def getBufferData(): Unit = {
if (!complete) {
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
val response =
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
else doHTTPGETRequest(get_url())
extractInfo(response)
}
}
}
}
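A minimal consumption sketch; note that this performs live HTTP calls against the DataCite API, and the parameter values are illustrative:

// the importer is an Iterator[String]: each element is one JSON page of DOIs
val importer = new DataciteAPIImporter(timestamp = 0L, blocks = 100L)
importer.take(2).foreach(page => println(page.length))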

View File

@ -10,24 +10,38 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
* @param json the json native records
*/
/** This class represents the data model of the input Dataset of Datacite
  * @param doi the DOI
  * @param timestamp timestamp of the last update date
  * @param isActive whether the record is active or deleted
  * @param json the native json record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following classes are utility classes used for the mapping from
the Datacite json format to the OAF schema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class RelatedIdentifierType(
relationType: String,
relatedIdentifier: String,
relatedIdentifierType: String
) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class NameIdentifiersType(
nameIdentifierScheme: Option[String],
schemeUri: Option[String],
nameIdentifier: Option[String]
) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class CreatorType(
nameType: Option[String],
nameIdentifiers: Option[List[NameIdentifiersType]],
name: Option[String],
familyName: Option[String],
givenName: Option[String],
affiliation: Option[List[String]]
) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
@ -35,100 +49,210 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class FundingReferenceType(
funderIdentifierType: Option[String],
awardTitle: Option[String],
awardUri: Option[String],
funderName: Option[String],
funderIdentifier: Option[String],
awardNumber: Option[String]
) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation:String, inverse:String, relType:String)
case class OAFRelations(relation: String, inverse: String, relType: String)
class DataciteModelConstants extends Serializable {
}
class DataciteModelConstants extends Serializable {}
object DataciteModelConstants {
val REL_TYPE_VALUE:String = "resultResult"
val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String,OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
val DATACITE_COLLECTED_FROM: KeyValue =
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
val subRelTypeMapping: Map[String, OAFRelations] = Map(
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.SUPPLEMENT
),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
ModelConstants.IS_SUPPLEMENT_TO,
ModelConstants.IS_SUPPLEMENTED_BY,
ModelConstants.SUPPLEMENT
),
ModelConstants.HAS_PART -> OAFRelations(
ModelConstants.HAS_PART,
ModelConstants.IS_PART_OF,
ModelConstants.PART
),
ModelConstants.IS_PART_OF -> OAFRelations(
ModelConstants.IS_PART_OF,
ModelConstants.HAS_PART,
ModelConstants.PART
),
ModelConstants.IS_VERSION_OF -> OAFRelations(
ModelConstants.IS_VERSION_OF,
ModelConstants.HAS_VERSION,
ModelConstants.VERSION
),
ModelConstants.HAS_VERSION -> OAFRelations(
ModelConstants.HAS_VERSION,
ModelConstants.IS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.IS_IDENTICAL_TO,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
ModelConstants.IS_CONTINUED_BY,
ModelConstants.CONTINUES,
ModelConstants.RELATIONSHIP
),
ModelConstants.CONTINUES -> OAFRelations(
ModelConstants.CONTINUES,
ModelConstants.IS_CONTINUED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
ModelConstants.IS_PREVIOUS_VERSION_OF,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_SOURCE_OF -> OAFRelations(
ModelConstants.IS_SOURCE_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
ModelConstants.IS_DERIVED_FROM,
ModelConstants.IS_SOURCE_OF,
ModelConstants.VERSION
),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
ModelConstants.IS_VARIANT_FORM_OF,
ModelConstants.IS_DERIVED_FROM,
ModelConstants.VERSION
),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
ModelConstants.IS_OBSOLETED_BY,
ModelConstants.IS_NEW_VERSION_OF,
ModelConstants.VERSION
),
ModelConstants.REVIEWS -> OAFRelations(
ModelConstants.REVIEWS,
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEW
),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
ModelConstants.IS_REVIEWED_BY,
ModelConstants.REVIEWS,
ModelConstants.REVIEW
),
ModelConstants.DOCUMENTS -> OAFRelations(
ModelConstants.DOCUMENTS,
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
ModelConstants.IS_DOCUMENTED_BY,
ModelConstants.DOCUMENTS,
ModelConstants.RELATIONSHIP
),
ModelConstants.COMPILES -> OAFRelations(
ModelConstants.COMPILES,
ModelConstants.IS_COMPILED_BY,
ModelConstants.RELATIONSHIP
),
ModelConstants.IS_COMPILED_BY -> OAFRelations(
ModelConstants.IS_COMPILED_BY,
ModelConstants.COMPILES,
ModelConstants.RELATIONSHIP
)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream!= null)
require(stream != null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
trust
)
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH
)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val df_it: DateTimeFormatter =
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda__h2020::"
),
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda_______::"
)
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
Pattern.compile(
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
Pattern.MULTILINE
),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
Pattern.compile(
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
Pattern.MULTILINE
),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
Pattern.compile(
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
Pattern.MULTILINE
),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -19,22 +19,46 @@ import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.Source
object DataciteToOAFTransformation {
case class HostedByMapType(
openaire_id: String,
datacite_name: String,
official_name: String,
similarity: Option[Float]
) {}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(
ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID,
ModelConstants.UNKNOWN_REPOSITORY.getValue,
ModelConstants.UNKNOWN_REPOSITORY.getValue,
Some(1.0f)
)
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
/** This method indicates whether a record should be skipped: when the raw record
  * contains one of the invalid strings defined in the file datacite_filter,
  * or when the publisher is "FAIRsharing"
  *
  * @param record : the Datacite record, not yet parsed
  * @param json   : the parsed record
  * @return True if the record should be skipped
  */
def skip_record(record: String, json: org.json4s.JValue): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher")
.extractOrElse[String]("")
.equalsIgnoreCase("FAIRsharing")
/**
* This method should skip record if json contains invalid text
* defined in gile datacite_filter
*
* @param json
* @return True if the record should be skipped
*/
def skip_record(json: String): Boolean = {
datacite_filter.exists(f => json.contains(f))
}
@deprecated("this method will be removed", "dhp")
@ -74,35 +98,39 @@ object DataciteToOAFTransformation {
}
/** This utility method indicates whether the embargo date has been reached
  * @param embargo_end_date the embargo end date, expected in [yyyy-MM-dd] format
  * @return True if the embargo date has been reached, false otherwise
  */
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
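A quick illustration of the semantics (the dates below are illustrative; the results assume the current date lies between them):

embargo_end("2000-01-01") // true: the embargo date has passed
embargo_end("2999-12-31") // false: still under embargo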
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
val d = Date_regex
.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
})
.find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
case _: Throwable =>
try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
@ -118,31 +146,78 @@ object DataciteToOAFTransformation {
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
  /** Use the vocabulary dnet:publication_resource to find a synonym for one of these terms
    * and get the instance.type.
    * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
    * to generate one of the following main entities:
    *  - publication
    *  - dataset
    *  - software
    *  - otherresearchproduct
    *
    * @param resourceType        the Datacite resource type
    * @param resourceTypeGeneral the Datacite general resource type
    * @param schemaOrg           the schema.org type
    * @param vocabularies        the vocabularies used to resolve the synonyms
    * @return a pair of qualifiers (instance type, result type), or null when no synonym is found
    */
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
val typeQualifier = vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_PUBLICATION_RESOURCE,
resourceTypeGeneral
)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
null
}
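Illustratively, and only as an assumption about typical vocabulary contents (the exact terms depend on the dnet:publication_resource and dnet:result_typologies vocabularies loaded at runtime):

// getTypeQualifier("Journal Article", "Text", null, vocabularies)
//   may resolve to a pair like (article qualifier, publication qualifier),
//   or null when no synonym is found in the vocabularies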
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
def getResult(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): Result = {
val typeQualifiers: (Qualifier, Qualifier) =
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
@ -168,13 +243,12 @@ object DataciteToOAFTransformation {
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
@ -182,29 +256,28 @@ object DataciteToOAFTransformation {
}
/**
* As describe in ticket #6377
* when the result come from figshare we need to remove subject
* and set Access rights OPEN.
*
* @param r
*/
/** As described in ticket #6377,
  * when the result comes from figshare we need to remove the subjects
  * and set the access rights to OPEN.
  *
  * @param r the result to be fixed in place
  */
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
val hosted_by_figshare = r
.getInstance()
.asScala
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
val l: List[Subject] = List()
r.setSubject(l.asJava)
}
}
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@ -214,7 +287,13 @@ object DataciteToOAFTransformation {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
cf: KeyValue,
di: DataInfo
): Relation = {
val r = new Relation
r.setSource(sourceId)
@ -226,7 +305,6 @@ object DataciteToOAFTransformation {
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@ -238,22 +316,28 @@ object DataciteToOAFTransformation {
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
}
else
} else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (skip_record(input))
return List()
def generateOAF(
input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
if (skip_record(input, json))
return List()
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val resourceTypeGeneral =
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
@ -265,63 +349,92 @@ object DataciteToOAFTransformation {
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
// the DOI is mapped to a PID inside an Instance object
val doi_q = OafMapperUtils.qualifier(
"doi",
"doi",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES
)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
// this identifier will be replaced later on, applying the PID-based identifier generation logic
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
a.setPid(
c.nameIdentifiers.get
.map(ni => {
val q =
if (ni.nameIdentifierScheme.isDefined)
vocabularies.getTermAsQualifier(
ModelConstants.DNET_PID_TYPES,
ni.nameIdentifierScheme.get.toLowerCase()
)
else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
} else
null
}
})
.asJava
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setAffiliation(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
}
})
.asJava
)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
@ -337,46 +450,81 @@ object DataciteToOAFTransformation {
if (a_date.isDefined) {
if (doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
result.setEmbargoenddate(
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
)
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
} else if (publication_year != null) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result
.getInstance()
.get(0)
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
result.setRelevantdate(
dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d =>
(
d._1.get,
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
)
)
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2))
.asJava
)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setSubject(
subjects
.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.subject(
s.subject.get,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
null
)
)
.asJava
)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
@ -384,66 +532,88 @@ object DataciteToOAFTransformation {
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
.filter(d => d.description.isDefined)
.map(d => OafMapperUtils.field(d.description.get, null))
.filter(s => s != null)
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
result.setLanguage(
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
)
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
if (client.isDefined) {
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
val oid = result.getId
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
if (!result.getId.equalsIgnoreCase(oid)) {
result.setOriginalId((oid :: List(doi)).asJava)
}
var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
@ -452,51 +622,81 @@ object DataciteToOAFTransformation {
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
} else
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
rels
private def generateRelations(
rels: List[RelatedIdentifierType],
id: String,
date: String
): List[Relation] = {
val bidirectionalRels: List[Relation] = rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(id, target, subRelType, r.relationType, date)
})
val citationRels: List[Relation] = rels
.filter(r =>
(r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv")) &&
(r.relationType.toLowerCase.contains("cite") || r.relationType.toLowerCase.contains("reference"))
)
.map(r => {
r.relationType match {
case ModelConstants.CITES | ModelConstants.REFERENCES =>
val target = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(id, target, ModelConstants.CITATION, ModelConstants.CITES, date)
case ModelConstants.IS_CITED_BY | ModelConstants.IS_REFERENCED_BY =>
val source = DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
relation(source, id, ModelConstants.CITATION, ModelConstants.CITES, date)
}
})
citationRels ::: bidirectionalRels
}
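// Helper that assembles a single relation stamped with the Datacite provenance and
// the relation date carried as a property.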
def relation(source: String, target: String, subRelType: String, relClass: String, date: String): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(relClass)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(source)
rel.setTarget(target)
rel
}
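// Hypothetical usage of relation(...) (DOI and date invented for illustration):
// emit a citation from the current result to a not-yet-resolved DOI.
//   val target = DHPUtils.generateUnresolvedIdentifier("10.1234/abcd", "doi")
//   val rel    = relation(result.getId, target, ModelConstants.CITATION, ModelConstants.CITES, "2021-01-01")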
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
@@ -504,5 +704,4 @@ object DataciteToOAFTransformation {
s"10|$b::${DHPUtils.md5(a)}"
}
}
View File
@@ -12,12 +12,12 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
/** Here every Spark application runs this method,
* where the whole logic of the Spark node is defined.
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
@@ -46,49 +46,65 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
/** For working with an MDStore we need to store on HDFS a file
* reporting the size of the current dataset.
* @param targetPath     path of the dataset whose records are counted
* @param outputBasePath MDStore base path where the size file is written
*/
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
val total_items = spark.read.text(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
writeHdfsFile(
spark.sparkContext.hadoopConfiguration,
s"$total_items",
outputBasePath + MDSTORE_SIZE_PATH
)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
require(spark!= null)
/** Generate the transformed and cleaned OAF Dataset from the native one
*
* @param sourcePath   sourcePath of the native Dataset in JSON/Datacite format
* @param exportLinks  if true, unresolved links are also generated
* @param vocabularies vocabularies used for cleaning
* @param targetPath   the targetPath of the result Dataset
* @param spark        the active Spark session
*/
def generateDataciteDataset(
sourcePath: String,
exportLinks: Boolean,
vocabularies: VocabularyGroup,
targetPath: String,
spark: SparkSession
): Unit = {
require(spark != null)
import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
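// Pipeline: read the native Datacite dump, keep only active records, map each one
// to zero or more OAF objects via DataciteToOAFTransformation, drop nulls and save.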
CollectionUtils.saveDataset(
spark.read.load(sourcePath).as[DataciteType]
spark.read
.load(sourcePath)
.as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.flatMap(d =>
DataciteToOAFTransformation
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
)
.filter(d => d != null),
targetPath)
targetPath
)
}
}
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
new GenerateDataciteDatasetSpark(
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
args,
log
).initialize().run()
}
}
View File
@@ -22,7 +22,6 @@ object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
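// Parses a raw record returned by the Datacite API into a DataciteType, keeping the
// original JSON payload and converting the "updated" ISO timestamp to epoch seconds.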
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
@@ -32,14 +31,26 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
DataciteType(
doi = doi,
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
isActive = isActive,
json = input
)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/datacite/import_from_api.json"
)
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
@@ -60,7 +71,8 @@ object ImportDatacite {
val skipImport = parser.get("skipImport")
log.info(s"skipImport is $skipImport")
val spark: SparkSession = SparkSession.builder()
val spark: SparkSession = SparkSession
.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
@@ -78,45 +90,48 @@ object ImportDatacite {
import spark.implicits._
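// Aggregator used to pick, among duplicate records (grouped upstream, presumably
// by DOI), the one with the most recent timestamp; null is the neutral element.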
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {

  override def zero: DataciteType = null

  override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
    if (b == null)
      return a
    if (a == null)
      return b
    if (a.timestamp > b.timestamp) {
      return a
    }
    b
  }

  override def merge(a: DataciteType, b: DataciteType): DataciteType = {
    reduce(a, b)
  }

  override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

  override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]

  override def finish(reduction: DataciteType): DataciteType = reduction
}
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
val ts = dump.select(max("timestamp")).first().getLong(0)
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
val cnt =
if ("true".equalsIgnoreCase(spkipImport)) 1
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
val inputRdd: RDD[DataciteType] = sc
.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
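// Merge the fresh harvest with the existing dump: aggregate duplicates keeping
// the most recent version per record, repartition and rewrite the dump.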
@@ -129,7 +144,9 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
.write
.mode(SaveMode.Overwrite)
.save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
@@ -137,14 +154,24 @@ object ImportDatacite {
}
}
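// Harvests the Datacite API from the last known timestamp onwards and appends each
// record to an HDFS SequenceFile (IntWritable progressive key -> Text JSON value).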
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
private def writeSequenceFile(
hdfsTargetPath: Path,
timestamp: Long,
conf: Configuration,
bs: Int
): Long = {
var from: Long = timestamp * 1000
val delta: Long = 100000000L
var client: DataciteAPIImporter = null
val now: Long = System.currentTimeMillis()
var i = 0
try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
val writer = SequenceFile.createWriter(
conf,
SequenceFile.Writer.file(hdfsTargetPath),
SequenceFile.Writer.keyClass(classOf[IntWritable]),
SequenceFile.Writer.valueClass(classOf[Text])
)
try {
var start: Long = System.currentTimeMillis
while (from < now) {
@@ -153,16 +180,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
key.set({
key.set {
i += 1;
i - 1
})
}
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
val time = (end - start) / 1000.0F
val time = (end - start) / 1000.0f
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
@@ -174,8 +201,7 @@ object ImportDatacite {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
}
catch {
} catch {
case e: Throwable =>
log.error("Error", e)
}
Some files were not shown because too many files have changed in this diff