Compare commits


10 Commits

120 changed files with 4888 additions and 8223 deletions

View File

@@ -0,0 +1,53 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private final InputStream inputStream;
private final MediaType mediaType;
private final long length;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
}
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.length = len;
}
@Override
public MediaType contentType() {
return mediaType;
}
@Override
public long contentLength() {
return length;
}
@Override
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
sink.writeAll(source);
} finally {
Util.closeQuietly(source);
}
}
}
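A minimal usage sketch of the streaming request body above, assuming OkHttp 3.x on the classpath; the file path and the endpoint URL are hypothetical placeholders:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.InputStreamRequestBody;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

public class InputStreamRequestBodyExample {
    public static void main(String[] args) throws IOException {
        File dump = new File("/tmp/records.zip"); // hypothetical file
        try (InputStream is = new FileInputStream(dump)) {
            // the content is streamed to the server instead of being buffered in memory
            RequestBody body = InputStreamRequestBody
                .create(MediaType.parse("application/zip"), is, dump.length());
            Request request = new Request.Builder()
                .url("https://example.org/upload") // hypothetical endpoint
                .put(body)
                .build();
            try (Response response = new OkHttpClient().newCall(request).execute()) {
                System.out.println("HTTP " + response.code());
            }
        }
    }
}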

View File

@@ -0,0 +1,8 @@
package eu.dnetlib.dhp.common.api;
public class MissingConceptDoiException extends Exception {
public MissingConceptDoiException(String message) {
super(message);
}
}

View File

@@ -0,0 +1,365 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public void setDeposition_id(String deposition_id) {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
}
/**
* Creates a brand new deposition in Zenodo. It sets the deposition_id and the bucket where the files to upload will be stored
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
* Uploads a file to the current deposition in Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @return the response code
*/
public int uploadIS(InputStream is, String file_name) throws IOException {
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
// read the error stream on failures, otherwise the regular input stream
final InputStream stream = conn.getResponseCode() >= 400 ? conn.getErrorStream() : conn.getInputStream();
if (stream == null) {
return body;
}
try (BufferedReader br = new BufferedReader(
new InputStreamReader(stream, "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
* Associates metadata information with the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMetadata(String metadata) throws IOException {
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final String body = getBody(conn);
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
// the call succeeded only for 200 (OK) or 201 (Created)
return HttpURLConnection.HTTP_OK == responseCode ||
HttpURLConnection.HTTP_CREATED == responseCode;
}
/**
* To publish the current deposition. It works both for a new deposition and for a new version of an old deposition
*
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
*            part of the URL of the DOI that Zenodo suggests to cite all versions: for DOI 10.xxx/zenodo.656930,
*            concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
* Resumes an upload for a version or a new deposition that has not been published yet.
* It sets the deposition_id and the bucket to be used.
*
* @param deposition_id the deposition id of the not yet published upload
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
deposition_id = zm.getId();
return;
}
}
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions(String page) throws IOException {
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String inputUrl) throws IOException {
URL url = new URL(inputUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
}
}
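A minimal end-to-end sketch of the deposit workflow implemented by this client (create a deposition, upload a file, attach metadata, publish); the access token, file path and metadata payload are placeholders, and the same sequence is exercised by the unit tests further below:
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoDepositExample {
    public static void main(String[] args) throws IOException {
        ZenodoAPIClient client = new ZenodoAPIClient(
            "https://sandbox.zenodo.org/api/deposit/depositions", "MY_ACCESS_TOKEN"); // placeholder token
        client.newDeposition(); // creates the deposition and sets deposition_id and bucket
        try (InputStream is = new FileInputStream("/tmp/dump.json.gz")) { // hypothetical file
            client.uploadIS(is, "dump.json.gz"); // streams the file into the deposition bucket
        }
        // minimal metadata document following the Zenodo deposition metadata schema
        String metadata = "{\"metadata\": {\"title\": \"Example dump\", \"upload_type\": \"dataset\", "
            + "\"description\": \"Example deposition\", \"creators\": [{\"name\": \"Doe, Jane\"}]}}";
        client.sendMetadata(metadata);
        client.publish(); // makes the deposition publicly available
    }
}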

View File

@@ -0,0 +1,14 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Community {
private String identifier;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@@ -0,0 +1,47 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Creator {
private String affiliation;
private String name;
private String orcid;
public String getAffiliation() {
return affiliation;
}
public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (name != null) {
c.name = name;
}
if (affiliation != null) {
c.affiliation = affiliation;
}
if (orcid != null) {
c.orcid = orcid;
}
return c;
}
}

View File

@@ -0,0 +1,44 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
public String getChecksum() {
return checksum;
}
public void setChecksum(String checksum) {
this.checksum = checksum;
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public long getFilesize() {
return filesize;
}
public void setFilesize(long filesize) {
this.filesize = filesize;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
}

View File

@@ -0,0 +1,23 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Grant implements Serializable {
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Grant newInstance(String id) {
Grant g = new Grant();
g.id = id;
return g;
}
}

View File

@@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Links implements Serializable {
private String bucket;
private String discard;
private String edit;
private String files;
private String html;
private String latest_draft;
private String latest_draft_html;
private String publish;
private String self;
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public String getDiscard() {
return discard;
}
public void setDiscard(String discard) {
this.discard = discard;
}
public String getEdit() {
return edit;
}
public void setEdit(String edit) {
this.edit = edit;
}
public String getFiles() {
return files;
}
public void setFiles(String files) {
this.files = files;
}
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getLatest_draft() {
return latest_draft;
}
public void setLatest_draft(String latest_draft) {
this.latest_draft = latest_draft;
}
public String getLatest_draft_html() {
return latest_draft_html;
}
public void setLatest_draft_html(String latest_draft_html) {
this.latest_draft_html = latest_draft_html;
}
public String getPublish() {
return publish;
}
public void setPublish(String publish) {
this.publish = publish;
}
public String getSelf() {
return self;
}
public void setSelf(String self) {
this.self = self;
}
}

View File

@@ -0,0 +1,153 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
}
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public List<Community> getCommunities() {
return communities;
}
public void setCommunities(List<Community> communities) {
this.communities = communities;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<Grant> getGrants() {
return grants;
}
public void setGrants(List<Grant> grants) {
this.grants = grants;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getLicense() {
return license;
}
public void setLicense(String license) {
this.license = license;
}
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
}
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
}
public List<String> getReferences() {
return references;
}
public void setReferences(List<String> references) {
this.references = references;
}
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
}
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@@ -0,0 +1,25 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class PrereserveDoi implements Serializable {
private String doi;
private String recid;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getRecid() {
return recid;
}
public void setRecid(String recid) {
this.recid = recid;
}
}

View File

@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class RelatedIdentifier implements Serializable {
private String identifier;
private String relation;
private String resource_type;
private String scheme;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelation() {
return relation;
}
public void setRelation(String relation) {
this.relation = relation;
}
public String getResource_type() {
return resource_type;
}
public void setResource_type(String resource_type) {
this.resource_type = resource_type;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
}

View File

@@ -0,0 +1,118 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
}
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public List<File> getFiles() {
return files;
}
public void setFiles(List<File> files) {
this.files = files;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Links getLinks() {
return links;
}
public void setLinks(Links links) {
this.links = links;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public String getModified() {
return modified;
}
public void setModified(String modified) {
this.modified = modified;
}
public String getOwner() {
return owner;
}
public void setOwner(String owner) {
this.owner = owner;
}
public String getRecord_id() {
return record_id;
}
public void setRecord_id(String record_id) {
this.record_id = record_id;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public boolean isSubmitted() {
return submitted;
}
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@@ -0,0 +1,7 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

View File

@@ -0,0 +1,109 @@
package eu.dnetlib.dhp.common.api;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newDeposition());
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
}

View File

@@ -19,7 +19,6 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
@@ -121,8 +120,6 @@ public class CollectorWorker extends ReportingJob {
return new FileCollectorPlugin(fileSystem);
case fileGzip:
return new FileGZipCollectorPlugin(fileSystem);
case baseDump:
return new BaseCollectorPlugin(this.fileSystem);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))

View File

@@ -10,8 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml, file, fileGzip, baseDump;
oai, other, rest_json2xml, file, fileGzip;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb

View File

@@ -1,171 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Iterator;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
public class BaseCollectorIterator implements Iterator<String> {
private String nextElement;
private final BlockingQueue<String> queue = new LinkedBlockingQueue<>(100);
private static final Logger log = LoggerFactory.getLogger(BaseCollectorIterator.class);
private static final String END_ELEM = "__END__";
public BaseCollectorIterator(final FileSystem fs, final Path filePath, final AggregatorReport report) {
new Thread(() -> importHadoopFile(fs, filePath, report)).start();
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
}
protected BaseCollectorIterator(final String resourcePath, final AggregatorReport report) {
new Thread(() -> importTestFile(resourcePath, report)).start();
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
}
@Override
public synchronized boolean hasNext() {
return (this.nextElement != null) && !END_ELEM.equals(this.nextElement);
}
@Override
public synchronized String next() {
try {
return END_ELEM.equals(this.nextElement) ? null : this.nextElement;
} finally {
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
}
}
private void importHadoopFile(final FileSystem fs, final Path filePath, final AggregatorReport report) {
log.info("I start to read the TAR stream");
try (InputStream origInputStream = fs.open(filePath);
final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
importTarStream(tarInputStream, report);
} catch (final Throwable e) {
throw new RuntimeException("Error processing BASE records", e);
}
}
private void importTestFile(final String resourcePath, final AggregatorReport report) {
try (final InputStream origInputStream = BaseCollectorIterator.class.getResourceAsStream(resourcePath);
final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
importTarStream(tarInputStream, report);
} catch (final Throwable e) {
throw new RuntimeException("Error processing BASE records", e);
}
}
private void importTarStream(final TarArchiveInputStream tarInputStream, final AggregatorReport report) {
long count = 0;
final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
try {
TarArchiveEntry entry;
while ((entry = (TarArchiveEntry) tarInputStream.getNextEntry()) != null) {
final String name = entry.getName();
if (!entry.isDirectory() && name.contains("ListRecords") && name.endsWith(".bz2")) {
log.info("Processing file (BZIP): " + name);
final byte[] bzipData = new byte[(int) entry.getSize()];
IOUtils.readFully(tarInputStream, bzipData);
try (InputStream bzipIs = new ByteArrayInputStream(bzipData);
final BufferedInputStream bzipBis = new BufferedInputStream(bzipIs);
final CompressorInputStream bzipInput = new CompressorStreamFactory()
.createCompressorInputStream(bzipBis)) {
final XMLEventReader reader = xmlInputFactory.createXMLEventReader(bzipInput);
XMLEventWriter eventWriter = null;
StringWriter xmlWriter = null;
while (reader.hasNext()) {
final XMLEvent nextEvent = reader.nextEvent();
if (nextEvent.isStartElement()) {
final StartElement startElement = nextEvent.asStartElement();
if ("record".equals(startElement.getName().getLocalPart())) {
xmlWriter = new StringWriter();
eventWriter = xmlOutputFactory.createXMLEventWriter(xmlWriter);
}
}
if (eventWriter != null) {
eventWriter.add(nextEvent);
}
if (nextEvent.isEndElement()) {
final EndElement endElement = nextEvent.asEndElement();
if ("record".equals(endElement.getName().getLocalPart())) {
eventWriter.flush();
eventWriter.close();
this.queue.put(xmlWriter.toString());
eventWriter = null;
xmlWriter = null;
count++;
}
}
}
}
}
}
this.queue.put(END_ELEM); // TO INDICATE THE END OF THE QUEUE
} catch (final Throwable e) {
log.error("Error processing BASE records", e);
report.put(e.getClass().getName(), e.getMessage());
throw new RuntimeException("Error processing BASE records", e);
} finally {
log.info("Total records (written in queue): " + count);
}
}
}

View File

@@ -1,159 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Optional;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.AbstractSplittedRecordPlugin;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class BaseCollectorPlugin implements CollectorPlugin {
private final FileSystem fs;
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
// MAPPING AND FILTERING ARE DEFINED HERE:
// https://docs.google.com/document/d/1Aj-ZAV11b44MCrAAUCPiS2TUlXb6PnJEu1utCMAcCOU/edit
public BaseCollectorPlugin(final FileSystem fs) {
this.fs = fs;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
// the path of the dump file on HDFS
// http://oai.base-search.net/initial_load/base_oaipmh_dump-current.tar
// it could be downloaded from iis-cdh5-test-gw.ocean.icm.edu.pl and then copied on HDFS
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
// get the parameters for the connection to the OpenAIRE database.
// the database is used to obtain the list of the datasources that the plugin will collect
final String dbUrl = api.getParams().get("dbUrl");
final String dbUser = api.getParams().get("dbUser");
final String dbPassword = api.getParams().get("dbPassword");
// the types(comma separated, empty value for all) that the plugin will collect,
// the types should be expressed in the format of the normalized types of BASE (for example 1,121,...)
final String acceptedNormTypesString = api.getParams().get("acceptedNormTypes");
log.info("baseUrl: {}", filePath);
log.info("dbUrl: {}", dbUrl);
log.info("dbUser: {}", dbUser);
log.info("dbPassword: {}", "***");
log.info("acceptedNormTypes: {}", acceptedNormTypesString);
try {
if (!this.fs.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
} catch (final Throwable e) {
throw new CollectorException(e);
}
final Set<String> acceptedOpendoarIds = findAcceptedOpendoarIds(dbUrl, dbUser, dbPassword);
final Set<String> acceptedNormTypes = new HashSet<>();
if (StringUtils.isNotBlank(acceptedNormTypesString)) {
for (final String s : StringUtils.split(acceptedNormTypesString, ",")) {
if (StringUtils.isNotBlank(s)) {
acceptedNormTypes.add(s.trim());
}
}
}
final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
return StreamSupport
.stream(spliterator, false)
.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
}
private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
throws CollectorException {
final Set<String> accepted = new HashSet<>();
try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
final String sql = IOUtils
.toString(
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
dbClient.processResults(sql, row -> {
try {
final String dsId = row.getString("id");
log.info("Accepted Datasource: " + dsId);
accepted.add(dsId);
} catch (final SQLException e) {
log.error("Error in SQL", e);
throw new RuntimeException("Error in SQL", e);
}
});
} catch (final IOException e) {
log.error("Error accessong SQL", e);
throw new CollectorException("Error accessong SQL", e);
}
log.info("Accepted Datasources (TOTAL): " + accepted.size());
return accepted;
}
protected static boolean filterXml(final String xml,
final Set<String> acceptedOpendoarIds,
final Set<String> acceptedNormTypes) {
try {
final Document doc = DocumentHelper.parseText(xml);
final String id = doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim();
if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) {
return false;
}
if (acceptedNormTypes.isEmpty()) {
return true;
}
for (final Object s : doc.selectNodes("//*[local-name()='typenorm']")) {
if (acceptedNormTypes.contains(((Node) s).getText().trim())) {
return true;
}
}
return false;
} catch (final DocumentException e) {
log.error("Error parsing document", e);
throw new RuntimeException("Error parsing document", e);
}
}
}

View File

@@ -165,7 +165,7 @@ public class OaiIterator implements Iterator<String> {
} catch (final DocumentException e1) {
final String resumptionToken = extractResumptionToken(xml);
if (resumptionToken == null) {
report.put(e1.getClass().getName(), e1.getMessage());
report.put(e1.getClass().getName(), e1.getMessage());
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
}
return resumptionToken;

View File

@@ -1,114 +0,0 @@
BEGIN;
INSERT INTO dsm_services(
_dnet_resource_identifier_,
id,
officialname,
englishname,
namespaceprefix,
websiteurl,
logourl,
platform,
contactemail,
collectedfrom,
provenanceaction,
_typology_to_remove_,
eosc_type,
eosc_datasource_type,
research_entity_types,
thematic
) VALUES (
'openaire____::base_search',
'openaire____::base_search',
'Bielefeld Academic Search Engine (BASE)',
'Bielefeld Academic Search Engine (BASE)',
'base_search_',
'https://www.base-search.net',
'https://www.base-search.net/about/download/logo_224x57_white.gif',
'BASE',
'openaire-helpdesk@uni-bielefeld.de',
'infrastruct_::openaire',
'user:insert',
'aggregator::pubsrepository::unknown',
'Data Source',
'Aggregator',
ARRAY['Research Products'],
false
);
INSERT INTO dsm_service_organization(
_dnet_resource_identifier_,
organization,
service
) VALUES (
'fairsharing_::org::214@@openaire____::base_search',
'fairsharing_::org::214',
'openaire____::base_search'
);
INSERT INTO dsm_api(
_dnet_resource_identifier_,
id,
service,
protocol,
baseurl,
metadata_identifier_path
) VALUES (
'api_________::openaire____::base_search::dump',
'api_________::openaire____::base_search::dump',
'openaire____::base_search',
'baseDump',
'/user/michele.artini/base-import/base_oaipmh_dump-current.tar',
'//*[local-name()=''header'']/*[local-name()=''identifier'']'
);
INSERT INTO dsm_apiparams(
_dnet_resource_identifier_,
api,
param,
value
) VALUES (
'api_________::openaire____::base_search::dump@@dbUrl',
'api_________::openaire____::base_search::dump',
'dbUrl',
'jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus'
);
INSERT INTO dsm_apiparams(
_dnet_resource_identifier_,
api,
param,
value
) VALUES (
'api_________::openaire____::base_search::dump@@dbUser',
'api_________::openaire____::base_search::dump',
'dbUser',
'dnet'
);
INSERT INTO dsm_apiparams(
_dnet_resource_identifier_,
api,
param,
value
) VALUES (
'api_________::openaire____::base_search::dump@@dbPassword',
'api_________::openaire____::base_search::dump',
'dbPassword',
'***'
);
INSERT INTO dsm_apiparams(
_dnet_resource_identifier_,
api,
param,
value
) VALUES (
'api_________::openaire____::base_search::dump@@acceptedNormTypes',
'api_________::openaire____::base_search::dump',
'acceptedNormTypes',
'1,11,111,121,13,14,15,18,181,182,183,1A,6,7'
);
COMMIT;

View File

@@ -1,9 +0,0 @@
select s.id as id
from dsm_services s
where collectedfrom = 'openaire____::opendoar'
and jurisdiction = 'Institutional'
and s.id in (
select service from dsm_api where coalesce(compatibility_override, compatibility) = 'driver' or coalesce(compatibility_override, compatibility) = 'UNKNOWN'
) and s.id not in (
select service from dsm_api where coalesce(compatibility_override, compatibility) like '%openaire%'
);

View File

@@ -1,11 +0,0 @@
select
s.id as id,
s.jurisdiction as jurisdiction,
array_remove(array_agg(a.id || ' (compliance: ' || coalesce(a.compatibility_override, a.compatibility, 'UNKNOWN') || ')@@@' || coalesce(a.last_collection_total, 0)), NULL) as aggregations
from
dsm_services s
join dsm_api a on (s.id = a.service)
where
collectedfrom = 'openaire____::opendoar'
group by
s.id;

View File

@@ -1,180 +0,0 @@
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="c67911d6-9988-4a3b-b965-7d39bdd4a31d_Vm9jYWJ1bGFyeURTUmVzb3VyY2VzL1ZvY2FidWxhcnlEU1Jlc291cmNlVHlwZQ==" />
<RESOURCE_TYPE value="VocabularyDSResourceType" />
<RESOURCE_KIND value="VocabularyDSResources" />
<RESOURCE_URI value="" />
<DATE_OF_CREATION value="2024-02-13T11:15:48+00:00" />
</HEADER>
<BODY>
<CONFIGURATION>
<VOCABULARY_NAME code="base:normalized_types">base:normalized_types</VOCABULARY_NAME>
<VOCABULARY_DESCRIPTION>base:normalized_types</VOCABULARY_DESCRIPTION>
<TERMS>
<TERM native_name="Text" code="Text" english_name="Text" encoding="BASE">
<SYNONYMS>
<SYNONYM term="1" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Book" code="Book" english_name="Book" encoding="BASE">
<SYNONYMS>
<SYNONYM term="11" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Book part" code="Book part" english_name="Book part" encoding="BASE">
<SYNONYMS>
<SYNONYM term="111" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Journal/Newspaper" code="Journal/Newspaper" english_name="Journal/Newspaper" encoding="BASE">
<SYNONYMS>
<SYNONYM term="12" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Article contribution" code="Article contribution" english_name="Article contribution" encoding="BASE">
<SYNONYMS>
<SYNONYM term="121" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Other non-article" code="Other non-article" english_name="Other non-article" encoding="BASE">
<SYNONYMS>
<SYNONYM term="122" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Conference object" code="Conference object" english_name="Conference object" encoding="BASE">
<SYNONYMS>
<SYNONYM term="13" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Report" code="Report" english_name="Report" encoding="BASE">
<SYNONYMS>
<SYNONYM term="14" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Review" code="Review" english_name="Review" encoding="BASE">
<SYNONYMS>
<SYNONYM term="15" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Course material" code="Course material" english_name="Course material" encoding="BASE">
<SYNONYMS>
<SYNONYM term="16" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Lecture" code="Lecture" english_name="Lecture" encoding="BASE">
<SYNONYMS>
<SYNONYM term="17" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Thesis" code="Thesis" english_name="Thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="18" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Bachelor's thesis" code="Bachelor's thesis" english_name="Bachelor's thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="181" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Master's thesis" code="Master's thesis" english_name="Master's thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="182" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Doctoral and postdoctoral thesis" code="Doctoral and postdoctoral thesis" english_name="Doctoral and postdoctoral thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="183" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Manuscript" code="Manuscript" english_name="Manuscript" encoding="BASE">
<SYNONYMS>
<SYNONYM term="19" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Patent" code="Patent" english_name="Patent" encoding="BASE">
<SYNONYMS>
<SYNONYM term="1A" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Musical notation" code="Musical notation" english_name="Musical notation" encoding="BASE">
<SYNONYMS>
<SYNONYM term="2" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Map" code="Map" english_name="Map" encoding="BASE">
<SYNONYMS>
<SYNONYM term="3" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Audio" code="Audio" english_name="Audio" encoding="BASE">
<SYNONYMS>
<SYNONYM term="4" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Image/Video" code="Image/Video" english_name="Image/Video" encoding="BASE">
<SYNONYMS>
<SYNONYM term="5" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Still image" code="Still image" english_name="Still image" encoding="BASE">
<SYNONYMS>
<SYNONYM term="51" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Moving image/Video" code="Moving image/Video" english_name="Moving image/Video" encoding="BASE">
<SYNONYMS>
<SYNONYM term="52" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Software" code="Software" english_name="Software" encoding="BASE">
<SYNONYMS>
<SYNONYM term="6" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Dataset" code="Dataset" english_name="Dataset" encoding="BASE">
<SYNONYMS>
<SYNONYM term="7" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Unknown" code="Unknown" english_name="Unknown" encoding="BASE">
<SYNONYMS>
<SYNONYM term="F" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
</TERMS>
</CONFIGURATION>
<STATUS>
<LAST_UPDATE value="2013-11-18T10:46:36Z" />
</STATUS>
<SECURITY_PARAMETERS>String</SECURITY_PARAMETERS>
</BODY>
</RESOURCE_PROFILE>

View File

@@ -1,298 +0,0 @@
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="" />
<RESOURCE_TYPE value="TransformationRuleDSResourceType" />
<RESOURCE_KIND value="TransformationRuleDSResources" />
<RESOURCE_URI value="" />
<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
</HEADER>
<BODY>
<CONFIGURATION>
<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
<SINK_METADATA_FORMAT name="oaf_hbase" />
<IMPORTED />
<SCRIPT>
<TITLE>xslt_base2oaf_hadoop</TITLE>
<CODE>
<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
xmlns:base_dc="http://oai.base-search.net/base_dc/"
xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
<xsl:param name="varOfficialName" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7" select="'corda_______::'" />
<xsl:param name="varH2020" select="'corda__h2020::'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
<xsl:param name="index" select="0" />
<xsl:param name="transDate" select="current-dateTime()" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<!-- TO EVALUATE
base_dc:authod_id
base_dc:authod_id/base_dc:creator_id
base_dc:authod_id/base_dc:creator_name
example:
<dc:creator>ALBU, Svetlana</dc:creator>
<base_dc:authod_id>
<base_dc:creator_name>ALBU, Svetlana</base_dc:creator_name>
<base_dc:creator_id>https://orcid.org/0000-0002-8648-950X</base_dc:creator_id>
</base_dc:authod_id>
-->
<!-- NOT USED
base_dc:global_id (I used oai:identifier)
base_dc:collection/text()
base_dc:continent
base_dc:country
base_dc:year (I used dc:date)
dc:coverage
dc:language (I used base_dc:lang)
base_dc:link (I used dc:identifier)
-->
<xsl:variable name="varBaseNormType" select="vocabulary:clean(//base_dc:typenorm, 'base:normalized_types')" />
<metadata>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:title" />
<xsl:with-param name="targetElement" select="'dc:title'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:creator/replace(., '^(.*)\|.*$', '$1')" />
<xsl:with-param name="targetElement" select="'dc:creator'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:contributor" />
<xsl:with-param name="targetElement" select="'dc:contributor'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:description" />
<xsl:with-param name="targetElement" select="'dc:description'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:subject" />
<xsl:with-param name="targetElement" select="'dc:subject'" />
</xsl:call-template>
<!-- TODO: I'm not sure if this is the correct encoding -->
<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
<dc:subject><xsl:value-of select="concat(@type, ':', .)" /></dc:subject>
</xsl:for-each>
<!-- END TODO -->
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:publisher" />
<xsl:with-param name="targetElement" select="'dc:publisher'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:format" />
<xsl:with-param name="targetElement" select="'dc:format'" />
</xsl:call-template>
<dc:type>
<xsl:value-of select="$varBaseNormType" />
</dc:type>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:type" />
<xsl:with-param name="targetElement" select="'dc:type'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:source" />
<xsl:with-param name="targetElement" select="'dc:source'" />
</xsl:call-template>
<dc:language>
<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
</dc:language>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:rights" />
<xsl:with-param name="targetElement" select="'dc:rights'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:relation" />
<xsl:with-param name="targetElement" select="'dc:relation'" />
</xsl:call-template>
<xsl:if test="not(//dc:identifier[starts-with(., 'http')])">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:identifier[starts-with(., 'http')]" />
<xsl:with-param name="targetElement" select="'dc:identifier'" />
</xsl:call-template>
<xsl:for-each select="//dc:relation">
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</oaf:projectid>
</xsl:if>
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<dr:CobjCategory>
<xsl:variable name="varCobjCategory" select="vocabulary:clean($varBaseNormType, 'dnet:publication_resource')" />
<xsl:variable name="varSuperType" select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')" />
<xsl:attribute name="type" select="$varSuperType" />
<xsl:value-of select="$varCobjCategory" />
</dr:CobjCategory>
<oaf:accessrights>
<xsl:choose>
<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
<xsl:when test="//base_dc:rightsnorm">
<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
</xsl:when>
<xsl:when test="//dc:rights">
<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
</xsl:when>
<xsl:otherwise>UNKNOWN</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<xsl:for-each select="//base_dc:doi">
<oaf:identifier identifierType="doi">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
<oaf:identifier identifierType="url">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
<oaf:identifier identifierType="handle">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
<oaf:identifier identifierType='urn'>
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<oaf:identifier identifierType="oai-original">
<xsl:value-of
select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
</oaf:identifier>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="//base_dc:collname" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId" />
</xsl:attribute>
</oaf:collectedFrom>
<oaf:dateAccepted>
<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
</oaf:dateAccepted>
<xsl:if test="//base_dc:oa[.='1']">
<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
<oaf:fulltext>
<xsl:value-of select="normalize-space(.)" />
</oaf:fulltext>
</xsl:for-each>
</xsl:if>
<xsl:for-each select="//base_dc:collection/@ror_id">
<oaf:relation relType="resultOrganization"
subRelType="affiliation"
relClass="hasAuthorInstitution"
targetType="organization">
<xsl:choose>
<xsl:when test="contains(.,'https://ror.org/')">
<xsl:value-of select="concat('ror_________::', normalize-space(.))" />
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
</xsl:otherwise>
</xsl:choose>
</oaf:relation>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<xsl:template name="allElements">
<xsl:param name="sourceElement" />
<xsl:param name="targetElement" />
<xsl:for-each select="$sourceElement">
<xsl:element name="{$targetElement}">
<xsl:value-of select="normalize-space(.)" />
</xsl:element>
</xsl:for-each>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:if test="//oai:header/@status='deleted'">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate" />
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
</CODE>
</SCRIPT>
</CONFIGURATION>
<STATUS />
<SECURITY_PARAMETERS />
</BODY>
</RESOURCE_PROFILE>

View File

@@ -1,322 +0,0 @@
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" />
<RESOURCE_TYPE value="TransformationRuleDSResourceType" />
<RESOURCE_KIND value="TransformationRuleDSResources" />
<RESOURCE_URI value="" />
<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
</HEADER>
<BODY>
<CONFIGURATION>
<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
<SINK_METADATA_FORMAT name="odf_hbase" />
<IMPORTED />
<SCRIPT>
<TITLE>xslt_base2odf_hadoop</TITLE>
<CODE>
<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:base_dc="http://oai.base-search.net/base_dc/"
xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
<xsl:param name="varOfficialName" />
<xsl:param name="varDataSourceId" />
<xsl:param name="varFP7" select="'corda_______::'" />
<xsl:param name="varH2020" select="'corda__h2020::'" />
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
<xsl:param name="index" select="0" />
<xsl:param name="transDate" select="current-dateTime()" />
<xsl:template name="terminate">
<xsl:message terminate="yes">
record is not compliant, transformation is interrupted.
</xsl:message>
</xsl:template>
<xsl:template match="/">
<record>
<xsl:apply-templates select="//*[local-name() = 'header']" />
<!-- NOT USED
base_dc:global_id (I used oai:identifier)
base_dc:collection/text()
base_dc:continent
base_dc:country
dc:coverage
dc:source
dc:relation
dc:type (I used //base_dc:typenorm)
dc:language (I used base_dc:lang)
base_dc:link (I used dc:identifier)
-->
<xsl:variable name="varBaseNormType" select="vocabulary:clean(//base_dc:typenorm, 'base:normalized_types')" />
<metadata>
<datacite:resource>
<xsl:for-each select="//base_dc:doi">
<datacite:identifier identifierType="DOI">
<xsl:value-of select="." />
</datacite:identifier>
</xsl:for-each>
<datacite:alternateIdentifiers>
<xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
<datacite:identifier alternateIdentifierType="url">
<xsl:value-of select="." />
</datacite:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
<datacite:identifier alternateIdentifierType="handle">
<xsl:value-of select="." />
</datacite:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
<datacite:identifier alternateIdentifierType='urn'>
<xsl:value-of select="." />
</datacite:identifier>
</xsl:for-each>
<datacite:identifier alternateIdentifierType="oai-original">
<xsl:value-of
select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
</datacite:identifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers />
<datacite:resourceType><xsl:value-of select="$varBaseNormType" /></datacite:resourceType>
<datacite:titles>
<xsl:for-each select="//dc:title">
<datacite:title>
<xsl:value-of select="normalize-space(.)" />
</datacite:title>
</xsl:for-each>
</datacite:titles>
<datacite:creators>
<xsl:for-each select="//dc:creator">
<xsl:variable name="author" select="normalize-space(.)" />
<datacite:creator>
<datacite:creatorName>
<xsl:value-of select="$author" />
</datacite:creatorName>
<xsl:for-each select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id ">
<xsl:if test="contains(.,'https://orcid.org/')">
<nameIdentifier schemeURI="https://orcid.org/" nameIdentifierScheme="ORCID">
<xsl:value-of select="substring-after(., 'https://orcid.org/')" />
</nameIdentifier>
</xsl:if>
</xsl:for-each>
</datacite:creator>
</xsl:for-each>
</datacite:creators>
<datacite:contributors>
<xsl:for-each select="//dc:contributor">
<datacite:contributor>
<datacite:contributorName>
<xsl:value-of select="normalize-space(.)" />
</datacite:contributorName>
</datacite:contributor>
</xsl:for-each>
</datacite:contributors>
<datacite:descriptions>
<xsl:for-each select="//dc:description">
<datacite:description descriptionType="Abstract">
<xsl:value-of select="normalize-space(.)" />
</datacite:description>
</xsl:for-each>
</datacite:descriptions>
<datacite:subjects>
<xsl:for-each select="//dc:subject">
<datacite:subject>
<xsl:value-of select="normalize-space(.)" />
</datacite:subject>
</xsl:for-each>
<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
<datacite:subject subjectScheme="{@type}" classificationCode="{normalize-space(.)}">
<!-- TODO: the textual label should be resolved from the classification code -->
<xsl:value-of select="normalize-space(.)" />
</datacite:subject>
</xsl:for-each>
</datacite:subjects>
<datacite:publisher>
<xsl:value-of select="normalize-space(//dc:publisher)" />
</datacite:publisher>
<datacite:publicationYear>
<xsl:value-of select="normalize-space(//base_dc:year)" />
</datacite:publicationYear>
<datacite:formats>
<xsl:for-each select="//dc:format">
<datacite:format>
<xsl:value-of select="normalize-space(.)" />
</datacite:format>
</xsl:for-each>
</datacite:formats>
<datacite:language>
<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
</datacite:language>
<oaf:accessrights>
<xsl:if test="//base_dc:oa[.='1']">
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</xsl:if>
<xsl:for-each select="//dc:rights|//base_dc:rightsnorm">
<datacite:rights><xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')" /></datacite:rights>
</xsl:for-each>
</oaf:accessrights>
</datacite:resource>
<xsl:for-each select="//dc:relation">
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</oaf:projectid>
</xsl:if>
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<dr:CobjCategory>
<xsl:variable name="varCobjCategory" select="vocabulary:clean($varBaseNormType, 'dnet:publication_resource')" />
<xsl:variable name="varSuperType" select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')" />
<xsl:attribute name="type" select="$varSuperType" />
<xsl:value-of select="$varCobjCategory" />
</dr:CobjCategory>
<oaf:accessrights>
<xsl:choose>
<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
<xsl:when test="//base_dc:rightsnorm">
<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
</xsl:when>
<xsl:when test="//dc:rights">
<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
</xsl:when>
<xsl:otherwise>UNKNOWN</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<xsl:for-each select="//base_dc:doi">
<oaf:identifier identifierType="doi">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
<oaf:identifier identifierType="url">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
<oaf:identifier identifierType="handle">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
<oaf:identifier identifierType='urn'>
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<oaf:identifier identifierType="oai-original">
<xsl:value-of
select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
</oaf:identifier>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="//base_dc:collname" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId" />
</xsl:attribute>
</oaf:collectedFrom>
<oaf:dateAccepted>
<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
</oaf:dateAccepted>
<xsl:if test="//base_dc:oa[.='1']">
<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
<oaf:fulltext>
<xsl:value-of select="normalize-space(.)" />
</oaf:fulltext>
</xsl:for-each>
</xsl:if>
<xsl:for-each select="//base_dc:collection/@ror_id">
<oaf:relation relType="resultOrganization" subRelType="affiliation" relClass="hasAuthorInstitution" targetType="organization">
<xsl:choose>
<xsl:when test="contains(.,'https://ror.org/')">
<xsl:value-of select="concat('ror_________::', normalize-space(.))" />
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
</xsl:otherwise>
</xsl:choose>
</oaf:relation>
</xsl:for-each>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:if test="//oai:header/@status='deleted'">
<xsl:call-template name="terminate" />
</xsl:if>
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate" />
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*" />
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
</CODE>
</SCRIPT>
</CONFIGURATION>
<STATUS />
<SECURITY_PARAMETERS />
</BODY>
</RESOURCE_PROFILE>

View File

@ -1048,10 +1048,5 @@
"openaire_id": "re3data_____::r3d100010399",
"datacite_name": "ZEW Forschungsdatenzentrum",
"official_name": "ZEW Forschungsdatenzentrum"
},
"HBP.NEUROINF": {
"openaire_id": "fairsharing_::2975",
"datacite_name": "EBRAINS",
"official_name": "EBRAINS"
}
}

View File

@ -1,4 +1,4 @@
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
@ -8,40 +8,19 @@
<name>database</name>
<description>the PDB Database Working Path</description>
</property>
<property>
<name>mdStoreOutputId</name>
<description>the identifier of the cleaned MDStore</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>the path of the cleaned mdstore</description>
<name>targetPath</name>
<description>the Target Working dir path</description>
</property>
</parameters>
<start to="StartTransaction"/>
<start to="ConvertDB"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="StartTransaction">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="ConvertDB"/>
<error to="RollBack"/>
</action>
<action name="ConvertDB">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -62,48 +41,11 @@
<arg>--master</arg><arg>yarn</arg>
<arg>--dbPath</arg><arg>${sourcePath}</arg>
<arg>--database</arg><arg>${database}</arg>
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark>
<ok to="CommitVersion"/>
<error to="RollBack"/>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="CommitVersion">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="RollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>ROLLBACK</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/>
<end name="End"/>
</workflow-app>

View File

@ -2,5 +2,5 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
]

View File

@ -1,20 +1,5 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source Path",
"paramRequired": true
},
{
"paramName": "mo",
"paramLongName": "mdstoreOutputVersion",
"paramDescription": "the oaf path ",
"paramRequired": true
}
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
]

View File

@ -9,26 +9,34 @@
<description>the Working Path</description>
</property>
<property>
<name>mdStoreOutputId</name>
<description>the identifier of the cleaned MDStore</description>
<name>targetPath</name>
<description>the OAF MDStore Path</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>the path of the cleaned mdstore</description>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>resumeFrom</name>
<value>CreateEBIDataSet</value>
<value>DownloadEBILinks</value>
<description>node to start</description>
</property>
</parameters>
<start to="StartTransaction"/>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
<default to="DownloadEBILinks"/>
</switch>
</decision>
@ -69,29 +77,9 @@
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
</fs>
<ok to="StartTransaction"/>
<ok to="CreateEBIDataSet"/>
<error to="Kill"/>
</action>
<action name="StartTransaction">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="CreateEBIDataSet"/>
<error to="RollBack"/>
</action>
<action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
@ -107,49 +95,11 @@
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="CommitVersion">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="RollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>ROLLBACK</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -231,7 +231,7 @@ object BioDBToOAF {
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pid = (json \ "pid").extract[String].trim()
val pid = (json \ "pid").extract[String]
val d = new Dataset
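The added trim() matters because the UniProt dump used as input (see the sample data further down in this diff) carries a leading blank in the pid field, e.g. " Q6GZX4". A self-contained sketch of the extraction, using json4s as in the code above (object name is hypothetical):

import org.json4s._
import org.json4s.jackson.JsonMethods._

object UniprotPidTrim {
  def main(args: Array[String]): Unit = {
    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
    // pid as it appears in the UniProt sample records of this PR, with a leading space
    val json = parse("""{"pid": " Q6GZX4"}""")
    val pid = (json \ "pid").extract[String].trim() // "Q6GZX4"; without trim the space ends up in the identifier
    println(pid)
  }
}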

View File

@ -2,15 +2,12 @@ package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
object SparkTransformBioDatabaseToOAF {
@ -28,13 +25,8 @@ object SparkTransformBioDatabaseToOAF {
val dbPath: String = parser.get("dbPath")
log.info("dbPath: {}", database)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath: String = parser.get("targetPath")
log.info("targetPath: {}", database)
val spark: SparkSession =
SparkSession
@ -51,28 +43,24 @@ object SparkTransformBioDatabaseToOAF {
case "UNIPROT" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "PDB" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "SCHOLIX" =>
CollectionUtils.saveDataset(
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "CROSSREF_LINKS" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
}
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}

View File

@ -9,9 +9,6 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
object SparkEBILinksToOaf {
@ -35,13 +32,8 @@ object SparkEBILinksToOaf {
import spark.implicits._
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebLinks: Dataset[EBILinkItem] = spark.read
@ -54,10 +46,7 @@ object SparkEBILinksToOaf {
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
public class BaseCollectionInfo implements Serializable {
private static final long serialVersionUID = 5766333937429419647L;
private String id;
private String opendoarId;
private String rorId;
public String getId() {
return this.id;
}
public void setId(final String id) {
this.id = id;
}
public String getOpendoarId() {
return this.opendoarId;
}
public void setOpendoarId(final String opendoarId) {
this.opendoarId = opendoarId;
}
public String getRorId() {
return this.rorId;
}
public void setRorId(final String rorId) {
this.rorId = rorId;
}
}

View File

@ -1,184 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@Disabled
public class BaseCollectorIteratorTest {
@Test
void testImportFile() throws Exception {
long count = 0;
final BaseCollectorIterator iterator = new BaseCollectorIterator("base-sample.tar", new AggregatorReport());
final Map<String, Map<String, String>> collections = new HashMap<>();
final Map<String, AtomicInteger> fields = new HashMap<>();
final Set<String> types = new HashSet<>();
while (iterator.hasNext()) {
final Document record = DocumentHelper.parseText(iterator.next());
count++;
if ((count % 1000) == 0) {
System.out.println("# Read records: " + count);
}
// System.out.println(record.asXML());
for (final Object o : record.selectNodes("//*|//@*")) {
final String path = ((Node) o).getPath();
if (fields.containsKey(path)) {
fields.get(path).incrementAndGet();
} else {
fields.put(path, new AtomicInteger(1));
}
if (o instanceof Element) {
final Element n = (Element) o;
if ("collection".equals(n.getName())) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) {
final Map<String, String> collAttrs = new HashMap<>();
for (final Object ao : n.attributes()) {
collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
}
collections.put(collName, collAttrs);
}
} else if ("type".equals(n.getName())) {
types.add(n.getText().trim());
}
}
}
}
final ObjectMapper mapper = new ObjectMapper();
for (final Entry<String, Map<String, String>> e : collections.entrySet()) {
System.out.println(e.getKey() + ": " + mapper.writeValueAsString(e.getValue()));
}
for (final Entry<String, AtomicInteger> e : fields.entrySet()) {
System.out.println(e.getKey() + ": " + e.getValue().get());
}
System.out.println("TYPES: ");
for (final String s : types) {
System.out.println(s);
}
assertEquals(30000, count);
}
@Test
public void testParquet() throws Exception {
final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
final List<BaseRecordInfo> ls = new ArrayList<>();
for (int i = 0; i < 10; i++) {
ls.add(extractInfo(xml));
}
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.parallelize(ls);
final Dataset<BaseRecordInfo> df = spark
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
df.printSchema();
df.show(false);
}
private BaseRecordInfo extractInfo(final String s) {
try {
final Document record = DocumentHelper.parseText(s);
final BaseRecordInfo info = new BaseRecordInfo();
final Set<String> paths = new LinkedHashSet<>();
final Set<String> types = new LinkedHashSet<>();
final List<BaseCollectionInfo> colls = new ArrayList<>();
for (final Object o : record.selectNodes("//*|//@*")) {
paths.add(((Node) o).getPath());
if (o instanceof Element) {
final Element n = (Element) o;
final String nodeName = n.getName();
if ("collection".equals(nodeName)) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName)) {
final BaseCollectionInfo coll = new BaseCollectionInfo();
coll.setId(collName);
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
coll.setRorId(n.valueOf("@ror_id").trim());
colls.add(coll);
}
} else if ("type".equals(nodeName)) {
types.add("TYPE: " + n.getText().trim());
} else if ("typenorm".equals(nodeName)) {
types.add("TYPE_NORM: " + n.getText().trim());
}
}
}
info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
info.getTypes().addAll(types);
info.getPaths().addAll(paths);
info.setCollections(colls);
return info;
} catch (final DocumentException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -1,32 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
class BaseCollectorPluginTest {
@Test
void testFilterXml() throws Exception {
final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
final Set<String> validIds = new HashSet<>(Arrays.asList("opendoar____::1234", "opendoar____::4567"));
final Set<String> validTypes = new HashSet<>(Arrays.asList("1", "121"));
final Set<String> validTypes2 = new HashSet<>(Arrays.asList("1", "11"));
assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, validTypes));
assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, new HashSet<>()));
assertFalse(BaseCollectorPlugin.filterXml(xml, new HashSet<>(), validTypes));
assertFalse(BaseCollectorPlugin.filterXml(xml, validIds, validTypes2));
}
}
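For orientation, the assertions above, read together with record.xml below (opendoar_id 1234, typenorm 121), are consistent with a filter that requires the collection's OpenDOAR id to be among the accepted datasources and, when a type whitelist is supplied, the record's normalized type to be in it. A rough Scala sketch of that behaviour; this is an assumption about what BaseCollectorPlugin.filterXml checks, not its actual implementation:

import org.dom4j.DocumentHelper

object FilterXmlSketch {
  // assumed rule: accepted OpenDOAR collection AND (no type whitelist OR whitelisted typenorm)
  def filterXml(xml: String, validIds: Set[String], validTypes: Set[String]): Boolean = {
    val doc = DocumentHelper.parseText(xml)
    val opendoarId = "opendoar____::" + doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim
    val typeNorm = doc.valueOf("//*[local-name()='typenorm']").trim
    validIds.contains(opendoarId) && (validTypes.isEmpty || validTypes.contains(typeNorm))
  }
}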

View File

@ -1,49 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
public class BaseRecordInfo implements Serializable {
private static final long serialVersionUID = -8848232018350074593L;
private String id;
private List<BaseCollectionInfo> collections = new ArrayList<>();
private List<String> paths = new ArrayList<>();
private List<String> types = new ArrayList<>();
public String getId() {
return this.id;
}
public void setId(final String id) {
this.id = id;
}
public List<String> getPaths() {
return this.paths;
}
public void setPaths(final List<String> paths) {
this.paths = paths;
}
public List<String> getTypes() {
return this.types;
}
public void setTypes(final List<String> types) {
this.types = types;
}
public List<BaseCollectionInfo> getCollections() {
return this.collections;
}
public void setCollections(final List<BaseCollectionInfo> collections) {
this.collections = collections;
}
}

View File

@ -1,78 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@Disabled
@ExtendWith(MockitoExtension.class)
public class BaseTransfomationTest extends AbstractVocabularyTest {
private SparkConf sparkConf;
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
setUpVocabulary();
this.sparkConf = new SparkConf();
this.sparkConf.setMaster("local[*]");
this.sparkConf.set("spark.driver.host", "localhost");
this.sparkConf.set("spark.ui.enabled", "false");
}
@Test
void testBase2ODF() throws Exception {
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("record.xml")));
final XSLTTransformationFunction tr = loadTransformationRule("xml/base2odf.transformationRule.xml");
final MetadataRecord result = tr.call(mr);
System.out.println(result.getBody());
}
@Test
void testBase2OAF() throws Exception {
final MetadataRecord mr = new MetadataRecord();
mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("record.xml")));
final XSLTTransformationFunction tr = loadTransformationRule("xml/base2oaf.transformationRule.xml");
final MetadataRecord result = tr.call(mr);
System.out.println(result.getBody());
}
private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
final String xslt = new SAXReader()
.read(this.getClass().getResourceAsStream(path))
.selectSingleNode("//CODE/*")
.asXML();
final LongAccumulator la = new LongAccumulator();
return new XSLTTransformationFunction(new AggregationCounter(la, la, la), xslt, 0, this.vocabularies);
}
}

View File

@ -1,58 +0,0 @@
<record>
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>ftdoajarticles:oai:doaj.org/article:e2d5b5126b2d4e479933cc7f9a9ae0c1</identifier>
<datestamp>2022-12-31T11:48:55Z</datestamp>
</header>
<metadata xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/">
<base_dc:dc xsi:schemaLocation="http://oai.base-search.net/base_dc/ http://oai.base-search.net/base_dc/base_dc.xsd">
<base_dc:global_id>ftdoajarticles:oai:doaj.org/article:e2d5b5126b2d4e479933cc7f9a9ae0c1</base_dc:global_id>
<base_dc:continent>cww</base_dc:continent>
<base_dc:country>org</base_dc:country>
<base_dc:collection opendoar_id="1234" ror_id="ror1234">ftdoajarticles</base_dc:collection>
<base_dc:collname>TEST REPO</base_dc:collname>
<dc:title>Assessment of cultural heritage: the legislative and methodological framework of Russian Federation</dc:title>
<dc:creator>ALBU, Svetlana</dc:creator>
<dc:creator>LEȘAN, Anna</dc:creator>
<dc:subject>architectural heritage</dc:subject>
<dc:subject>evaluation of architectural heritage</dc:subject>
<dc:subject>types of values</dc:subject>
<dc:subject>experience of russian federation</dc:subject>
<dc:subject>Social Sciences</dc:subject>
<dc:subject>H</dc:subject>
<dc:description>Architectural heritage is the real estate inheritance by population of a country becoming an extremely valuable and specific category, preserving and capitalizing on those assets requires considerable effort. The state does not have sufficient means to maintain and preserve cultural heritage, as a result it is included in the civil circuit. The transfer of property right or of some partial rights over the architectural patrimony is accompanied by the necessity to estimate the value of goods. In this article, the authors examine the experience of Russian Federation (one of the largest countries with a huge architectural heritage) on the legislative framework of architectural and methodological heritage of architectural heritage assessment. The particularities of cultural assets valuation compared to other categories of real estate are examined, as well as the methodological aspects (types of values, methods applied in valuation, approaches according to the purpose of valuation) regarding the valuation of real estate with architectural value in Russian Federation.</dc:description>
<dc:publisher>Technical University of Moldova</dc:publisher>
<dc:date>2020-09-01T00:00:00Z</dc:date>
<base_dc:year>2020</base_dc:year>
<dc:type>article</dc:type>
<base_dc:typenorm>121</base_dc:typenorm>
<dc:identifier>https://doi.org/10.5281/zenodo.3971988</dc:identifier>
<dc:identifier>https://doaj.org/article/e2d5b5126b2d4e479933cc7f9a9ae0c1</dc:identifier>
<base_dc:link>https://doi.org/10.5281/zenodo.3971988</base_dc:link>
<dc:source>Journal of Social Sciences, Vol 3, Iss 3, Pp 134-143 (2020)</dc:source>
<dc:language>EN</dc:language>
<dc:language>FR</dc:language>
<dc:language>RO</dc:language>
<dc:relation>http://ibn.idsi.md/sites/default/files/imag_file/JSS-3-2020_134-143.pdf</dc:relation>
<dc:relation>https://doaj.org/toc/2587-3490</dc:relation>
<dc:relation>https://doaj.org/toc/2587-3504</dc:relation>
<dc:relation>doi:10.5281/zenodo.3971988</dc:relation>
<dc:relation>2587-3490</dc:relation>
<dc:relation>2587-3504</dc:relation>
<dc:relation>https://doaj.org/article/e2d5b5126b2d4e479933cc7f9a9ae0c1</dc:relation>
<base_dc:autoclasscode type="ddc">720</base_dc:autoclasscode>
<base_dc:authod_id>
<base_dc:creator_name>ALBU, Svetlana</base_dc:creator_name>
<base_dc:creator_id>https://orcid.org/0000-0002-8648-950X</base_dc:creator_id>
</base_dc:authod_id>
<base_dc:authod_id>
<base_dc:creator_name>LEȘAN, Anna</base_dc:creator_name>
<base_dc:creator_id>https://orcid.org/0000-0003-3284-0525</base_dc:creator_id>
</base_dc:authod_id>
<base_dc:doi>https://doi.org/10.5281/zenodo.3971988</base_dc:doi>
<base_dc:oa>1</base_dc:oa>
<base_dc:lang>eng</base_dc:lang>
<base_dc:lang>fre</base_dc:lang>
<base_dc:lang>rum</base_dc:lang>
</base_dc:dc>
</metadata>
</record>
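Worked through by hand against the base2oaf/base2odf templates in this pull request, the sample record above would map roughly as follows (indicative only, not generated output):
- hostedBy: name "TEST REPO", id opendoar____::1234 (from base_dc:collection/@opendoar_id)
- affiliation relation: ror_________::https://ror.org/ror1234 (the ror_id lacks the https://ror.org/ prefix, so the URL base is prepended)
- DOI identifier: https://doi.org/10.5281/zenodo.3971988, taken verbatim from base_dc:doi
- access rights: OPEN (base_dc:oa = 1), with the dc:relation values starting with http also emitted as oaf:fulltext
- language: cleaned from base_dc:lang through the dnet:languages vocabulary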

View File

@ -1,44 +1,15 @@
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}
{"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
{"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
{"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
{"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
{"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
{"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
{"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
{"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
{"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
{"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
{"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
{"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
{"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
{"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
{"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}

View File

@ -1,36 +1,6 @@
{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}

View File

@ -1496,30 +1496,4 @@ cnr:institutes @=@ __CDS131__ @=@ IBE - Istituto per la BioEconomia
cnr:institutes @=@ https://ror.org/0263zy895 @=@ CDS132
cnr:institutes @=@ https://ror.org/0263zy895 @=@ SCITEC - Istituto di Scienze e Tecnologie Chimiche \"Giulio Natta\"
cnr:institutes @=@ __CDS133__ @=@ CDS133
cnr:institutes @=@ __CDS133__ @=@ STEMS - Istituto di Scienze e Tecnologie per l'Energia e la Mobilità Sostenibili
base:normalized_types @=@ Text @=@ 1
base:normalized_types @=@ Book @=@ 11
base:normalized_types @=@ Book part @=@ 111
base:normalized_types @=@ Journal/Newspaper @=@ 12
base:normalized_types @=@ Article contribution @=@ 121
base:normalized_types @=@ Other non-article @=@ 122
base:normalized_types @=@ Conference object @=@ 13
base:normalized_types @=@ Report @=@ 14
base:normalized_types @=@ Review @=@ 15
base:normalized_types @=@ Course material @=@ 16
base:normalized_types @=@ Lecture @=@ 17
base:normalized_types @=@ Thesis @=@ 18
base:normalized_types @=@ Bachelor's thesis @=@ 181
base:normalized_types @=@ Master's thesis @=@ 182
base:normalized_types @=@ Doctoral and postdoctoral thesis @=@ 183
base:normalized_types @=@ Manuscript @=@ 19
base:normalized_types @=@ Patent @=@ 1A
base:normalized_types @=@ Musical notation @=@ 2
base:normalized_types @=@ Map @=@ 3
base:normalized_types @=@ Audio @=@ 4
base:normalized_types @=@ Image/Video @=@ 5
base:normalized_types @=@ Still image @=@ 51
base:normalized_types @=@ Moving image/Video @=@ 52
base:normalized_types @=@ Software @=@ 6
base:normalized_types @=@ Dataset @=@ 7
base:normalized_types @=@ Unknown @=@ F
cnr:institutes @=@ __CDS133__ @=@ STEMS - Istituto di Scienze e Tecnologie per l'Energia e la Mobilità Sostenibili

View File

@ -1210,29 +1210,4 @@ cnr:institutes @=@ cnr:institutes @=@ __CDS130__ @=@ __CDS130__
cnr:institutes @=@ cnr:institutes @=@ __CDS131__ @=@ __CDS131__
cnr:institutes @=@ cnr:institutes @=@ https://ror.org/0263zy895 @=@ https://ror.org/0263zy895
cnr:institutes @=@ cnr:institutes @=@ __CDS133__ @=@ __CDS133__
base:normalized_types @=@ base:normalized_types @=@ Text @=@ Text
base:normalized_types @=@ base:normalized_types @=@ Book @=@ Book
base:normalized_types @=@ base:normalized_types @=@ Book part @=@ Book part
base:normalized_types @=@ base:normalized_types @=@ Journal/Newspaper @=@ Journal/Newspaper
base:normalized_types @=@ base:normalized_types @=@ Article contribution @=@ Article contribution
base:normalized_types @=@ base:normalized_types @=@ Other non-article @=@ Other non-article
base:normalized_types @=@ base:normalized_types @=@ Conference object @=@ Conference object
base:normalized_types @=@ base:normalized_types @=@ Report @=@ Report
base:normalized_types @=@ base:normalized_types @=@ Review @=@ Review
base:normalized_types @=@ base:normalized_types @=@ Course material @=@ Course material
base:normalized_types @=@ base:normalized_types @=@ Lecture @=@ Lecture
base:normalized_types @=@ base:normalized_types @=@ Thesis @=@ Thesis
base:normalized_types @=@ base:normalized_types @=@ Bachelor's thesis @=@ Bachelor's thesis
base:normalized_types @=@ base:normalized_types @=@ Master's thesis @=@ Master's thesis
base:normalized_types @=@ base:normalized_types @=@ Doctoral and postdoctoral thesis @=@ Doctoral and postdoctoral thesis
base:normalized_types @=@ base:normalized_types @=@ Manuscript @=@ Manuscript
base:normalized_types @=@ base:normalized_types @=@ Patent @=@ Patent
base:normalized_types @=@ base:normalized_types @=@ Musical notation @=@ Musical notation
base:normalized_types @=@ base:normalized_types @=@ Map @=@ Map
base:normalized_types @=@ base:normalized_types @=@ Audio @=@ Audio
base:normalized_types @=@ base:normalized_types @=@ Image/Video @=@ Image/Video
base:normalized_types @=@ base:normalized_types @=@ Still image @=@ Still image
base:normalized_types @=@ base:normalized_types @=@ Moving image/Video @=@ Moving image/Video
base:normalized_types @=@ base:normalized_types @=@ Software @=@ Software
base:normalized_types @=@ base:normalized_types @=@ Dataset @=@ Dataset
base:normalized_types @=@ base:normalized_types @=@ Unknown @=@ Unknown

View File

@ -15,12 +15,4 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>pivotHistoryDatabase</name>
<value>&#x200B;</value>
</property>
</configuration>

View File

@ -198,8 +198,6 @@
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
</spark>
<ok to="PrepareOrgRels"/>
<error to="Kill"/>

View File

@ -1,6 +1,26 @@
[
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the OAF Orcid transformed", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source path ", "paramRequired": false},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the path of the OAF Orcid transformed",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "isLookupUrl",
"paramDescription": "the isLookup URL",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path ",
"paramRequired": false
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
}
]
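This parameter specification follows the ArgumentApplicationParser convention used by the Spark jobs in this changeset (see SparkMapDumpIntoOAF further down, where parser.get is used on these names). A minimal consumption sketch follows; the resource path is a placeholder, since the actual location of this file is not visible in the diff, and parseArgument is assumed to take the job's command-line arguments as in the other jobs of this repository.

import org.apache.commons.io.IOUtils
import eu.dnetlib.dhp.application.ArgumentApplicationParser

// Placeholder resource path; the real path of the parameter file above is not shown in this diff.
val parser = new ArgumentApplicationParser(
  IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref_params_example.json"))
)
parser.parseArgument(args) // args: Array[String] from the job's main method

val targetPath = parser.get("targetPath")   // required
val isLookupUrl = parser.get("isLookupUrl") // required, newly added: the ISLookup service URL
val sourcePath = parser.get("sourcePath")   // optional
val master = parser.get("master")           // required Spark master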

View File

@ -73,6 +73,12 @@
"name": "Irish Nephrology Society",
"synonym": []
},
{
"id": "100011062",
"uri": "http://dx.doi.org/10.13039/100011062",
"name": "Asian Spinal Cord Network",
"synonym": []
},
{
"id": "100011096",
"uri": "http://dx.doi.org/10.13039/100011096",
@ -217,6 +223,12 @@
"name": "Global Brain Health Institute",
"synonym": []
},
{
"id": "100015776",
"uri": "http://dx.doi.org/10.13039/100015776",
"name": "Health and Social Care Board",
"synonym": []
},
{
"id": "100015992",
"uri": "http://dx.doi.org/10.13039/100015992",
@ -391,6 +403,18 @@
"name": "Irish Hospice Foundation",
"synonym": []
},
{
"id": "501100001596",
"uri": "http://dx.doi.org/10.13039/501100001596",
"name": "Irish Research Council for Science, Engineering and Technology",
"synonym": []
},
{
"id": "501100001597",
"uri": "http://dx.doi.org/10.13039/501100001597",
"name": "Irish Research Council for the Humanities and Social Sciences",
"synonym": []
},
{
"id": "501100001598",
"uri": "http://dx.doi.org/10.13039/501100001598",
@ -491,7 +515,7 @@
"id": "501100002081",
"uri": "http://dx.doi.org/10.13039/501100002081",
"name": "Irish Research Council",
"synonym": ["501100001596", "501100001597"]
"synonym": []
},
{
"id": "501100002736",

View File

@ -1,5 +1,6 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils}
@ -47,67 +48,16 @@ case object Crossref2Oaf {
json.extract[List[funderInfo]]
}
val mappingCrossrefType = Map(
"book-section" -> "publication",
"book" -> "publication",
"book-chapter" -> "publication",
"book-part" -> "publication",
"book-series" -> "publication",
"book-set" -> "publication",
"book-track" -> "publication",
"edited-book" -> "publication",
"reference-book" -> "publication",
"monograph" -> "publication",
"journal-article" -> "publication",
"dissertation" -> "publication",
"other" -> "publication",
"peer-review" -> "publication",
"proceedings" -> "publication",
"proceedings-article" -> "publication",
"reference-entry" -> "publication",
"report" -> "publication",
"report-series" -> "publication",
"standard" -> "publication",
"standard-series" -> "publication",
"posted-content" -> "publication",
"dataset" -> "dataset"
)
val mappingCrossrefSubType = Map(
"book-section" -> "0013 Part of book or chapter of book",
"book" -> "0002 Book",
"book-chapter" -> "0013 Part of book or chapter of book",
"book-part" -> "0013 Part of book or chapter of book",
"book-series" -> "0002 Book",
"book-set" -> "0002 Book",
"book-track" -> "0002 Book",
"edited-book" -> "0002 Book",
"reference-book" -> "0002 Book",
"monograph" -> "0002 Book",
"journal-article" -> "0001 Article",
"dissertation" -> "0044 Thesis",
"other" -> "0038 Other literature type",
"peer-review" -> "0015 Review",
"proceedings" -> "0004 Conference object",
"proceedings-article" -> "0004 Conference object",
"reference-entry" -> "0013 Part of book or chapter of book",
"report" -> "0017 Report",
"report-series" -> "0017 Report",
"standard" -> "0038 Other literature type",
"standard-series" -> "0038 Other literature type",
"dataset" -> "0021 Dataset",
"preprint" -> "0016 Preprint",
"report" -> "0017 Report"
)
def getIrishId(doi: String): Option[String] = {
def getIrishId(doi: String): Option[String] = {
val id = doi.split("/").last
irishFunder
.find(f => id.equalsIgnoreCase(f.id) || (f.synonym.nonEmpty && f.synonym.exists(s => s.equalsIgnoreCase(id))))
.map(f => f.id)
}
def mappingResult(result: Result, json: JValue, cobjCategory: String, originalType: String): Result = {
def mappingResult(result: Result, json: JValue, instanceType:Qualifier, originalType: String): Result = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
@ -275,27 +225,20 @@ case object Crossref2Oaf {
instance.setAccessright(
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
)
instance.setInstancetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
instance.setInstancetype(instanceType)
//ADD ORIGINAL TYPE to the mapping
val itm = new InstanceTypeMapping
itm.setOriginalType(originalType)
itm.setVocabularyName(ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)
instance.setInstanceTypeMapping(List(itm).asJava)
result.setResourcetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
// result.setResourcetype(
// OafMapperUtils.qualifier(
// cobjCategory.substring(0, 4),
// cobjCategory.substring(5),
// ModelConstants.DNET_PUBLICATION_RESOURCE,
// ModelConstants.DNET_PUBLICATION_RESOURCE
// )
// )
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
@ -354,7 +297,40 @@ case object Crossref2Oaf {
a
}
def convert(input: String): List[Oaf] = {
/**
* Uses the dnet:publication_resource vocabulary to find a synonym of the given Crossref resource type and obtain the instance.type qualifier.
* The instance.type synonym is then looked up in the dnet:result_typologies vocabulary
* to decide which of the main entities to generate:
* - publication
* - dataset
* - software
* - otherresearchproduct
* A usage sketch follows the method below.
*
* @param resourceType the original Crossref resource type
* @param vocabularies the vocabulary group used for both lookups
* @return a triple (instance.type qualifier, result typology qualifier, original type), or null if the type cannot be resolved
*/
def getTypeQualifier(
resourceType: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier, String) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
),
resourceType
)
}
null
}
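A minimal usage sketch of this two-step lookup, assuming a loaded VocabularyGroup (as obtained in SparkMapDumpIntoOAF below); the example term "journal-article" and the expected class ids are taken from the static maps removed above and are illustrative only.

// Sketch only: `vocabularies` is assumed to be a loaded VocabularyGroup.
val typology = Crossref2Oaf.getTypeQualifier("journal-article", vocabularies)
if (typology != null) {
  val (instanceType, resultTypology, originalType) = typology
  // instanceType: dnet:publication_resource qualifier (e.g. classid "0001", classname "Article")
  // resultTypology: dnet:result_typologies qualifier (e.g. classid "publication")
  // originalType: the input string, later stored in the InstanceTypeMapping
  val result = Crossref2Oaf.generateItemFromType(resultTypology)
  // result is a Publication/Dataset/Software/OtherResearchProduct with resourcetype already set,
  // or null when the typology does not map to a known entity.
}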
def convert(input: String, vocabularies: VocabularyGroup): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -364,17 +340,17 @@ case object Crossref2Oaf {
val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null)
return resultList
val typology =getTypeQualifier(objectType, vocabularies)
val result = generateItemFromType(objectType, objectSubType)
if (typology == null)
return List()
val result = generateItemFromType(typology._2)
if (result == null)
return List()
val cOBJCategory = mappingCrossrefSubType.getOrElse(
objectType,
mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
)
val originalType = if (mappingCrossrefSubType.contains(objectType)) objectType else objectSubType
mappingResult(result, json, cOBJCategory, originalType)
mappingResult(result, json, typology._1, typology._3)
if (result == null || result.getId == null)
return List()
@ -392,7 +368,7 @@ case object Crossref2Oaf {
}
result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory)
case publication: Publication => convertPublication(publication, json, typology._1)
case dataset: Dataset => convertDataset(dataset)
}
@ -587,15 +563,7 @@ case object Crossref2Oaf {
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
"10.13039/501100013589" | "10.13039/501100000271" =>
generateSimpleRelationFromAward(funder, "ukri________", a => a)
//HFRI
case "10.13039/501100013209" =>
generateSimpleRelationFromAward(funder, "hfri________", a => a)
val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
//ERASMUS+
case "10.13039/501100010790" =>
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
case _ => logger.debug("no match for " + funder.DOI.get)
}
@ -630,12 +598,12 @@ case object Crossref2Oaf {
// TODO check if there are other info to map into the Dataset
}
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
def convertPublication(publication: Publication, json: JValue, cobjCategory: Qualifier): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
//Mapping book
if (cobjCategory.toLowerCase.contains("book")) {
if (cobjCategory.getClassname.toLowerCase.contains("book")) {
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
if (ISBN.nonEmpty && containerTitles.nonEmpty) {
val source = s"${containerTitles.head} ISBN: ${ISBN.head}"
@ -716,12 +684,24 @@ case object Crossref2Oaf {
null
}
def generateItemFromType(objectType: String, objectSubType: String): Result = {
if (mappingCrossrefType.contains(objectType)) {
if (mappingCrossrefType(objectType).equalsIgnoreCase("publication"))
return new Publication()
if (mappingCrossrefType(objectType).equalsIgnoreCase("dataset"))
return new Dataset()
def generateItemFromType(objectType: Qualifier): Result = {
if (objectType.getClassid.equalsIgnoreCase("publication")) {
val item = new Publication
item.setResourcetype(objectType)
return item
} else if (objectType.getClassid.equalsIgnoreCase("dataset")) {
val item = new Dataset
item.setResourcetype(objectType)
return item
} else if (objectType.getClassid.equalsIgnoreCase("software")) {
val item = new Software
item.setResourcetype(objectType)
return item
} else if (objectType.getClassid.equalsIgnoreCase("OtherResearchProduct")) {
val item = new OtherResearchProduct
item.setResourcetype(objectType)
return item
}
null
}

View File

@ -1,8 +1,10 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
@ -40,11 +42,17 @@ object SparkMapDumpIntoOAF {
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
val targetPath = parser.get("targetPath")
val isLookupUrl: String = parser.get("isLookupUrl")
logger.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
require(vocabularies != null)
spark.read
.load(parser.get("sourcePath"))
.as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))
.flatMap(k => Crossref2Oaf.convert(k.json, vocabularies))
.filter(o => o != null)
.write
.mode(SaveMode.Overwrite)

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.aggregation;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.mockito.Mock;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public abstract class AbstractVocabularyTest {
@Mock
protected ISLookUpService isLookUpService;
protected VocabularyGroup vocabularies;
public void setUpVocabulary() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
private static List<String> vocs() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/terms.txt")));
}
private static List<String> synonyms() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/synonyms.txt")));
}
}

File diff suppressed because it is too large

View File

@ -57,7 +57,7 @@
]
]
},
"type": "posted-content",
"type": "journal-article",
"URL": "http://dx.doi.org/10.1101/030080",
"is-referenced-by-count": 2,
"link": [

View File

@ -1,31 +1,42 @@
package eu.dnetlib.dhp.doiboost.crossref
import com.fasterxml.jackson.databind.SerializationFeature
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.crossref.Crossref2Oaf
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import com.fasterxml.jackson.databind.ObjectMapper
import org.json4s
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.{DefaultFormats, JValue}
import org.json4s.jackson.JsonMethods
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.Files
import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.matching.Regex
class CrossrefMappingTest {
@ExtendWith(Array(classOf[MockitoExtension]))
class CrossrefMappingTest extends AbstractVocabularyTest{
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper()
@BeforeEach
def setUp(): Unit = {
super.setUpVocabulary()
}
@Test
def testMissingAuthorParser():Unit = {
val json: String = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/s41567-022-01757-y.json")).mkString
val result = Crossref2Oaf.convert(json)
val result = Crossref2Oaf.convert(json, vocabularies)
result.filter(o => o.isInstanceOf[Publication]).map(p=> p.asInstanceOf[Publication]).foreach(p =>assertTrue(p.getAuthor.size()>0))
}
@ -45,13 +56,13 @@ class CrossrefMappingTest {
for (line <- funder_doi.linesWithSeparators.map(l => l.stripLineEnd)) {
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
checkRelation(resultList)
}
for (line <- funder_name.linesWithSeparators.map(l => l.stripLineEnd)) {
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
checkRelation(resultList)
}
@ -91,7 +102,7 @@ class CrossrefMappingTest {
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Result])
@ -110,14 +121,14 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
items.foreach(p => println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p)))
}
@ -137,7 +148,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty)
val result: List[Oaf] = Crossref2Oaf.convert(json)
val result: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(result.nonEmpty)
@ -158,8 +169,8 @@ class CrossrefMappingTest {
assertEquals(doisReference.size, relationList.size)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
relationList.foreach(p => println(mapper.writeValueAsString(p)))
relationList.foreach(p => println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p)))
}
@Test
@ -173,14 +184,14 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
items.foreach(p => println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p)))
}
@ -189,18 +200,17 @@ class CrossrefMappingTest {
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
.mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Result])
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
items.foreach(p => logger.info(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p)))
}
@ -230,7 +240,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val rels: List[Relation] =
@ -250,7 +260,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -261,7 +271,7 @@ class CrossrefMappingTest {
val result: Result = items.head.asInstanceOf[Result]
assertNotNull(result)
logger.info(mapper.writeValueAsString(result));
logger.info(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(result));
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
@ -326,7 +336,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -410,7 +420,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -458,7 +468,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -537,7 +547,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -563,7 +573,7 @@ class CrossrefMappingTest {
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -573,7 +583,8 @@ class CrossrefMappingTest {
assert(items.size == 1)
val result: Result = items.head.asInstanceOf[Publication]
assertNotNull(result)
logger.info(mapper.writeValueAsString(result));
logger.info(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(result));
}
@Test
@ -586,7 +597,7 @@ class CrossrefMappingTest {
val line: String =
"\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
val result: Result = items.head.asInstanceOf[Publication]
@ -605,7 +616,7 @@ class CrossrefMappingTest {
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
.mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
val resultList: List[Oaf] = Crossref2Oaf.convert(template, vocabularies)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
val result: Result = items.head.asInstanceOf[Publication]
@ -629,14 +640,14 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(item))
assertTrue(
item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
@ -659,7 +670,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -676,8 +687,8 @@ class CrossrefMappingTest {
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(item))
}
@ -694,7 +705,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -711,8 +722,7 @@ class CrossrefMappingTest {
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(item))
}
@ -729,7 +739,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -746,8 +756,7 @@ class CrossrefMappingTest {
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(item))
}
@ -764,7 +773,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -781,8 +790,8 @@ class CrossrefMappingTest {
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(item))
}
@ -797,7 +806,7 @@ class CrossrefMappingTest {
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
val resultList: List[Oaf] = Crossref2Oaf.convert(json, vocabularies)
assertTrue(resultList.nonEmpty)
@ -807,9 +816,8 @@ class CrossrefMappingTest {
assertEquals(1, item.getInstance().get(0).getUrl().size())
assertEquals(
"https://doi.org/10.1016/j.jas.2019.105013",
item.getInstance().get(0).getUrl().get(0)
item.getInstance().get(0).getUrl.get(0)
)
//println(mapper.writeValueAsString(item))
}

View File

@ -30,7 +30,7 @@ public class MoveResult implements Serializable {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MoveResult.class
MoveResult.class
.getResourceAsStream(
"/eu/dnetlib/dhp/wf/subworkflows/input_moveresult_parameters.json"));

View File

@ -53,8 +53,6 @@ public class Constraints implements Serializable {
for (Constraint sc : constraint) {
boolean verified = false;
if(!param.containsKey(sc.getField()))
return false;
for (String value : param.get(sc.getField())) {
if (sc.verifyCriteria(value.trim())) {
verified = true;

View File

@ -14,7 +14,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@ -85,26 +84,19 @@ public class SparkCountryPropagationJob {
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
log.info("Reading prepared info: {}", preparedInfoPath);
final Dataset<Row> preparedInfoRaw = spark
Dataset<ResultCountrySet> prepared = spark
.read()
.json(preparedInfoPath);
.json(preparedInfoPath)
.as(Encoders.bean(ResultCountrySet.class));
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
if (!preparedInfoRaw.isEmpty()) {
final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
} else {
res
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {

View File

@ -317,7 +317,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
p.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB

View File

@ -33,7 +33,7 @@ SELECT
dc.officialname AS collectedfromname,
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree

View File

@ -33,7 +33,7 @@ SELECT
dc.officialname AS collectedfromname,
p.contracttypeclass || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p

View File

@ -93,8 +93,8 @@ object CopyHdfsOafSparkApplication {
hasSource != null && hasTarget != null
} else {
val hasId = (json \ "id").extractOrElse[String](null)
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String]("")
hasId != null && oafType.startsWith(resultType)
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null)
hasId != null && oafType.equalsIgnoreCase(resultType)
}
}
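The two variants in this hunk differ in how a missing resulttype classid is treated: with an empty-string default, oafType.startsWith(resultType) accepts every record that has an id, while a null default combined with equalsIgnoreCase accepts none of them. A small, self-contained Java illustration of that String behaviour (not the project's code):

public class TypeMatchSketch {
    public static void main(String[] args) {
        String oafType = "publication";

        // Missing classid falling back to "": startsWith accepts anything.
        String missingAsEmpty = "";
        System.out.println(oafType.startsWith(missingAsEmpty)); // true

        // Missing classid staying null: the case-insensitive match rejects it (no NPE).
        String missingAsNull = null;
        System.out.println(oafType.equalsIgnoreCase(missingAsNull)); // false

        // With an actual classid, both strategies accept an exact match.
        System.out.println(oafType.startsWith("publication")); // true
        System.out.println(oafType.equalsIgnoreCase("Publication")); // true
    }
}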

View File

@ -59,19 +59,7 @@ public class CopyHdfsOafSparkApplicationTest {
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json")),
"publication"));
}
@Test
void isOafType_Datacite_ORP() throws IOException {
assertTrue(
CopyHdfsOafSparkApplication
.isOafType(
IOUtils
.toString(
getClass()
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json")),
"otherresearchproduct"));
}
}

View File

@ -1278,6 +1278,21 @@ class MappersTest {
System.out.println("***************");
}
@Test
void testIRISPub() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("iris-odf.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
final Publication p = (Publication) list.get(0);
assertNotNull(p.getInstance().get(0).getUrl().get(0));
assertValidId(p.getId());
System.out.println(p.getInstance().get(0).getUrl());
p.getPid().forEach(x -> System.out.println(x.getValue()));
p.getInstance().get(0).getAlternateIdentifier().forEach(x -> System.out.println(x.getValue()));
}
@Test
void testNotWellFormed() throws IOException {
final String xml = IOUtils

View File

@ -0,0 +1,215 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<oai:header xmlns="http://namespace.openaire.eu/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<identifier>oai:air.unimi.it:2434/907506</identifier>
<datestamp>2024-01-04T12:42:51Z</datestamp>
<setSpec>com_2434_73555</setSpec>
<setSpec>col_2434_73557</setSpec>
<setSpec>openaire</setSpec>
<dr:dateOfTransformation>2024-01-29T16:56:50.632Z</dr:dateOfTransformation>
<dri:objIdentifier>od______1261::ff2d9e058e7bea90a27f41c31078e601</dri:objIdentifier>
<dri:recordIdentifier>oai:air.unimi.it:2434/907506</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<oaf:datasourceprefix> od______1261</oaf:datasourceprefix>
</oai:header>
<metadata>
<oaire:resource xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:exslt="http://exslt.org/common"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:rdf="http://www.w3.org/TR/rdf-concepts/"
xmlns:doc="http://www.lyncode.com/xoai"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:vc="http://www.w3.org/2007/XMLSchema-versioning"
xmlns="http://www.openarchives.org/OAI/2.0/"
xsi:schemaLocation="http://namespace.openaire.eu/schema/oaire/ https://www.openaire.eu/schema/repo-lit/4.0/openaire.xsd">
<datacite:titles>
<datacite:title xml:lang="en">Ensuring tests of conservation interventions build on existing literature</datacite:title>
</datacite:titles>
<datacite:creator>
<datacite:creator>
<datacite:creatorName>W.J. Sutherland</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>S.T. Alvarez-Castaneda</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>T. Amano</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>R. Ambrosini</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>P. Atkinson</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>J.M. Baxter</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>A.L. Bond</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>P.J. Boon</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>K.L. Buchanan</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>J. Barlow</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>G. Bogliani</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>O.M. Bragg</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M. Burgman</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M.W. Cadotte</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M. Calver</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>S.J. Cooke</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>R.T. Corlett</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>V. Devictor</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>J.G. Ewen</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M. Fisher</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>G. Freeman</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>E. Game</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>B.J. Godley</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>C. Gortazar</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>I.R. Hartley</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>D.L. Hawksworth</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>K.A. Hobson</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M.-. Lu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>B. Martin-Lopez</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>K. Ma</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>A. Machado</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>D. Mae</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M. Mangiacotti</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>D.J. Mccafferty</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>V. Melfi</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>S. Molur</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>A.J. Moore</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>S.D. Murphy</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>D. Norri</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>A.P.E. van Oudenhoven</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>J. Power</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>E.C. Ree</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>M.W. Schwartz</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>I. Storch</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>C. Wordley</datacite:creatorName>
</datacite:creator>
</datacite:creator>
<datacite:relatedIdentifiers>
</datacite:relatedIdentifiers>
<datacite:dates>
<datacite:date dateType="Accepted">2020</datacite:date>
<datacite:date dateType="Issued">2020</datacite:date>
<datacite:date dateType="Available">2022-06-20</datacite:date>
</datacite:dates>
<dc:language>eng</dc:language>
<dc:publisher>Wiley Blackwell Publishing</dc:publisher>
<oaire:resourceType resourceTypeGeneral="literature"
uri="http://purl.org/coar/resource_type/c_6501">journal article</oaire:resourceType>
<dc:format>application/pdf</dc:format>
<datacite:identifier xmlns:datacite="http://datacite.org/schema/kernel-3"
identifierType="Handle">2434/907506</datacite:identifier>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
<datacite:subjects>
<datacite:subject>Conservation of Natural Resources</datacite:subject>
</datacite:subjects>
<datacite:sizes/>
<datacite:sizes/>
<datacite:sizes>
<datacite:size>191802 bytes</datacite:size>
</datacite:sizes>
<oaire:file accessRightsURI="" mimeType="application/pdf" objectType="fulltext">https://air.unimi.it/bitstream/2434/907506/4/Full%20manuscript%20resubmitted.pdf</oaire:file>
</oaire:resource>
<oaf:identifier identifierType="DOI">10.1111/cobi.13555</oaf:identifier>
<oaf:identifier identifierType="PMID">32779884</oaf:identifier>
<oaf:fulltext>https://air.unimi.it/bitstream/2434/907506/4/Full%20manuscript%20resubmitted.pdf</oaf:fulltext>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2020-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>eng</oaf:language>
<oaf:hostedBy name="Archivio Istituzionale della Ricerca dell'Università degli Studi di Milano"
id="opendoar____::1261"/>
<oaf:collectedFrom name="Archivio Istituzionale della Ricerca dell'Università degli Studi di Milano"
id="opendoar____::1261"/>
</metadata>
</record>

View File

@ -185,7 +185,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -213,7 +212,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -241,7 +239,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -269,7 +266,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -297,7 +293,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -325,7 +320,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -353,7 +347,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -393,7 +386,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -422,7 +414,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -451,7 +442,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -480,7 +470,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -509,7 +498,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -538,7 +526,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -567,7 +554,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
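Each Spark action in this workflow loses the line that pinned spark.executor.memoryOverhead to the executor memory value; when the property is omitted, the usual YARN-side default applies (a fraction of the executor memory, with a minimum floor). For reference only, a minimal sketch of setting the property programmatically on a SparkConf; the values are placeholders.

import org.apache.spark.SparkConf;

public class MemoryOverheadSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            .setAppName("overhead-example")
            .set("spark.executor.memory", "4g")
            // Container overhead per executor, on top of the executor heap.
            .set("spark.executor.memoryOverhead", "1g");
        System.out.println(conf.get("spark.executor.memoryOverhead")); // 1g
    }
}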

View File

@ -244,4 +244,27 @@ public class XmlRecordFactoryTest {
}
@Test
public void testIrisGuidelines4() throws DocumentException, IOException {
final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
final Publication p = OBJECT_MAPPER
.readValue(
IOUtils.toString(getClass().getResourceAsStream("iris-odf-4.json")),
Publication.class);
final String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
assertNotNull(xml);
final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
System.out.println(doc.asXML());
}
}

View File

@ -1,586 +0,0 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_dedup___::e225555a08a082ad8f53f179bc59c5d0</dri:objIdentifier>
<dri:dateOfCollection>2023-01-27T05:32:10Z</dri:dateOfCollection>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993"/>
<collectedfrom name="ORCID" id="openaire____::806360c771262b4d6770e7cdf04b5c5a"/>
<collectedfrom name="Microsoft Academic Graph" id="openaire____::5f532a3fc4f1ea403f37070f59a7a53a"/>
<collectedfrom name="Oxford University Research Archive"
id="opendoar____::2290a7385ed77cc5592dc2153229f082"/>
<collectedfrom name="PubMed Central" id="opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"/>
<collectedfrom name="Europe PubMed Central" id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"/>
<originalId>10.1098/rsta.2020.0257</originalId>
<originalId>50|doiboost____::e225555a08a082ad8f53f179bc59c5d0</originalId>
<originalId>3211056089</originalId>
<originalId>50|od______1064::83eb0f76b60445d72bb7428a1b68ef1a</originalId>
<originalId>oai:ora.ox.ac.uk:uuid:9fc4563a-07e1-41d1-8b99-31ce2f8ac027</originalId>
<originalId>50|od_______267::6d978e42c57dfc79d61a84ab5be28cb8</originalId>
<originalId>oai:pubmedcentral.nih.gov:8543046</originalId>
<originalId>od_______267::6d978e42c57dfc79d61a84ab5be28cb8</originalId>
<originalId>34689630</originalId>
<originalId>PMC8543046</originalId>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types">10.1098/rsta.2020.0257
</pid>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:crosswalk:repository"
trust="0.9">PMC8543046
</pid>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types" schemename="dnet:pid_types"
inferred="false" provenanceaction="sysimport:crosswalk:repository" trust="0.9">34689630
</pid>
<measure id="influence" score="5.534974E-9" class="C"/>
<measure id="popularity" score="2.0811036E-8" class="C"/>
<measure id="influence_alt" score="8" class="C"/>
<measure id="popularity_alt" score="2.736" class="C"/>
<measure id="impulse" score="8" class="C"/>
<measure id="downloads" count="1"/>
<measure id="views" count="4"/>
<title classid="alternative title" classname="alternative title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">A completely automated pipeline for 3D reconstruction of
human heart from 2D cine magnetic resonance slices.
</title>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">A completely automated pipeline for 3D reconstruction of
human heart from 2D cine magnetic resonance slices
</title>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<creator rank="8" URL="https://academic.microsoft.com/#/detail/2103042895">Vicente Grau</creator>
<creator rank="1" orcid="0000-0001-8198-5128"
URL="https://academic.microsoft.com/#/detail/2693349397">Abhirup Banerjee
</creator>
<creator rank="3" URL="https://academic.microsoft.com/#/detail/1425738153">Ernesto Zacur</creator>
<creator rank="6" URL="https://academic.microsoft.com/#/detail/2043191934">Robin P. Choudhury
</creator>
<creator rank="7" URL="https://academic.microsoft.com/#/detail/2166186014">Blanca Rodriguez
</creator>
<creator rank="2" URL="https://academic.microsoft.com/#/detail/2889449215">Julia Camps</creator>
<creator rank="5" URL="https://academic.microsoft.com/#/detail/264390263">Yoram Rudy</creator>
<creator rank="4" URL="https://academic.microsoft.com/#/detail/2713879776">Christopher M. Andrews
</creator>
<country classid="GB" classname="United Kingdom" schemeid="dnet:countries"
schemename="dnet:countries"/>
<dateofacceptance>2021-10-01</dateofacceptance>
<description>Cardiac magnetic resonance (CMR) imaging is a valuable modality in the diagnosis and
characterization of cardiovascular diseases, since it can identify abnormalities in structure
and function of the myocardium non-invasively and without the need for ionizing radiation.
However, in clinical practice, it is commonly acquired as a collection of separated and
independent 2D image planes, which limits its accuracy in 3D analysis. This paper presents a
completely automated pipeline for generating patient-specific 3D biventricular heart models from
cine magnetic resonance (MR) slices. Our pipeline automatically selects the relevant cine MR
images, segments them using a deep learning-based method to extract the heart contours, and
aligns the contours in 3D space correcting possible misalignments due to breathing or subject
motion first using the intensity and contours information from the cine data and next with the
help of a statistical shape model. Finally, the sparse 3D representation of the contours is used
to generate a smooth 3D biventricular mesh. The computational pipeline is applied and evaluated
in a CMR dataset of 20 healthy subjects. Our results show an average reduction of misalignment
artefacts from 1.82±1.60mm to 0.72±0.73mm over 20 subjects, in terms of distance from the
final reconstructed mesh. The high-resolution 3D biventricular meshes obtained with our
computational pipeline are used for simulations of electrical activation patterns, showing
agreement with non-invasive electrocardiographic imaging. The automatic methodologies presented
here for patient-specific MR imaging-based 3D biventricular representations contribute to the
efficient realization of precision medicine, enabling the enhanced interpretability of clinical
data, the digital twin vision through patient-specific image-based modelling and simulation, and
augmented reality applications.
This article is part of the theme issue 'Advanced computation in cardiovascular physiology: new
challenges and opportunities'.
</description>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">General Physics and Astronomy
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">General Engineering
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">General Mathematics
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Pipeline (computing)
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Cine mri
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Structure and function
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Cardiac magnetic resonance
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Magnetic resonance imaging
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.5338496">medicine.diagnostic_test
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.5338496">medicine
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Human heart
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Modality (humancomputer interaction)
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">3D reconstruction
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Computer science
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Nuclear magnetic resonance
</subject>
<subject classid="SDG" classname="Sustainable Development Goals"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:sdg">3. Good health
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03 medical and health sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0302 clinical medicine
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030218 Nuclear Medicine &amp;
Medical Imaging
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03021801 Radiology/Image
segmentation
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03021801 Radiology/Image
segmentation - deep learning/datum
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030204 Cardiovascular System
&amp; Hematology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03020401 Aging-associated
diseases/Heart diseases
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030217 Neurology &amp;
Neurosurgery
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03021701 Brain/Neural circuits
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">Articles
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">Research Articles
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">cardiac mesh reconstruction
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">cine MRI
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">misalignment correction
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">electrophysiological
simulation
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">ECGI
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Heart
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Humans
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Imaging, Three-Dimensional
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Magnetic Resonance Imaging
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Magnetic Resonance Imaging, Cine
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">Magnetic Resonance Spectroscopy
</subject>
<language classid="und" classname="Undetermined" schemeid="dnet:languages"
schemename="dnet:languages"/>
<relevantdate classid="created" classname="created" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date">2021-10-25
</relevantdate>
<relevantdate classid="published-online" classname="published-online" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date">2021-10-25
</relevantdate>
<relevantdate classid="published-print" classname="published-print" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date">2021-12-13
</relevantdate>
<relevantdate classid="UNKNOWN" classname="UNKNOWN" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">2021-01-01
</relevantdate>
<relevantdate classid="available" classname="available" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">2023-01-05
</relevantdate>
<relevantdate classid="Accepted" classname="Accepted" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">2021-05-28
</relevantdate>
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">2023-01-05
</relevantdate>
<publisher>The Royal Society</publisher>
<source>Crossref</source>
<source/>
<source>Philosophical transactions. Series A, Mathematical, physical, and engineering sciences
</source>
<resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies"
schemename="dnet:result_typologies"/>
<resourcetype classid="UNKNOWN" classname="UNKNOWN" schemeid="dnet:dataCite_resource"
schemename="dnet:dataCite_resource"/>
<journal issn="1364-503X" eissn="1471-2962" vol="379">Philosophical Transactions of the Royal
Society A: Mathematical, Physical and Engineering Sciences
</journal>
<context id="dth" label="Digital Twins in Health" type="community"></context>
<context id="EC" label="European Commission" type="funding">
<category id="EC::H2020" label="Horizon 2020 Framework Programme">
<concept id="EC::H2020::RIA" label="Research and Innovation action"/>
</category>
</context>
<datainfo>
<inferred>true</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.8</trust>
<inferenceprovenance>dedup-result-decisiontree-v3</inferenceprovenance>
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="sysimport:actionset">
<to class="hasAuthorInstitution" scheme="dnet:result_organization_relations"
type="organization">openorgs____::6a7b1b4c40a067a1f209de6867fe094d
</to>
<country classid="GB" classname="United Kingdom" schemeid="dnet:countries"
schemename="dnet:countries"/>
<legalname>University of Oxford</legalname>
<legalshortname>University of Oxford</legalshortname>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="sysimport:actionset">
<to class="IsSupplementedBy" scheme="dnet:result_result_relations" type="publication">
doi_dedup___::015b27b0b7c55649236bf23a5c75f817
</to>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">10.6084/m9.figshare.15656924.v2
</pid>
<dateofacceptance>2021-01-01</dateofacceptance>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">Implementation Details of the Reconstruction
Pipeline and Electrophysiological Inference Results from A completely automated pipeline
for 3D reconstruction of human heart from 2D cine magnetic resonance slices
</title>
<publisher>The Royal Society</publisher>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">10.6084/m9.figshare.15656924
</pid>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="sysimport:actionset">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">
corda__h2020::27f89b49dee12d828cc0f90f51727204
</to>
<code>823712</code>
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission"
jurisdiction="EU"/>
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
<funding_level_1 name="RIA">ec__________::EC::H2020::RIA</funding_level_1>
</funding>
<acronym>CompBioMed2</acronym>
<title>A Centre of Excellence in Computational Biomedicine</title>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="sysimport:actionset">
<to class="IsSupplementedBy" scheme="dnet:result_result_relations" type="dataset">
doi_dedup___::be1ef3b30a8d7aa7e4dfe1570d5febf7
</to>
<dateofacceptance>2021-01-01</dateofacceptance>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">10.6084/m9.figshare.15656927
</pid>
<publisher>The Royal Society</publisher>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">10.6084/m9.figshare.15656927.v1
</pid>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">Montage Video of the Stepwise Performance of 3D
Reconstruction Pipeline on All 20 Patients from A completely automated pipeline for 3D
reconstruction of human heart from 2D cine magnetic resonance slices
</title>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="sysimport:actionset">
<to class="IsSupplementedBy" scheme="dnet:result_result_relations" type="dataset">
doi_________::9f9f2328e11d379b14cb888209e33088
</to>
<dateofacceptance>2021-01-01</dateofacceptance>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">10.6084/m9.figshare.15656924.v1
</pid>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">Implementation Details of the Reconstruction
Pipeline and Electrophysiological Inference Results from A completely automated pipeline
for 3D reconstruction of human heart from 2D cine magnetic resonance slices
</title>
<publisher>The Royal Society</publisher>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
</rel>
</rels>
<children>
<result objidentifier="pmid________::9c855c9c4a8018df23f9d51f879b62b1">
<dateofacceptance>2021-10-01</dateofacceptance>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">34689630
</pid>
<collectedfrom name="PubMed Central" id="opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"/>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">34689630
</pid>
<publisher>The Royal Society</publisher>
<collectedfrom name="Europe PubMed Central"
id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"/>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">PMC8543046
</pid>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">A completely automated
pipeline for 3D reconstruction of human heart from 2D cine magnetic resonance slices
</title>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">PMC8543046
</pid>
</result>
<result objidentifier="od______1064::83eb0f76b60445d72bb7428a1b68ef1a">
<dateofacceptance>2023-01-05</dateofacceptance>
<publisher>Royal Society</publisher>
<collectedfrom name="Oxford University Research Archive"
id="opendoar____::2290a7385ed77cc5592dc2153229f082"/>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">A completely automated
pipeline for 3D reconstruction of human heart from 2D cine magnetic resonance slices
</title>
</result>
<result objidentifier="doi_________::e225555a08a082ad8f53f179bc59c5d0">
<dateofacceptance>2021-10-25</dateofacceptance>
<publisher>The Royal Society</publisher>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<title classid="alternative title" classname="alternative title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">A completely
automated pipeline for 3D reconstruction of human heart from 2D cine magnetic resonance
slices.
</title>
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993"/>
<collectedfrom name="ORCID" id="openaire____::806360c771262b4d6770e7cdf04b5c5a"/>
<collectedfrom name="Microsoft Academic Graph"
id="openaire____::5f532a3fc4f1ea403f37070f59a7a53a"/>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types">10.1098/rsta.2020.0257
</pid>
</result>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="Oxford University Research Archive"
id="opendoar____::2290a7385ed77cc5592dc2153229f082"/>
<hostedby name="Oxford University Research Archive"
id="opendoar____::2290a7385ed77cc5592dc2153229f082"/>
<dateofacceptance>2023-01-05</dateofacceptance>
<instancetype classid="0038" classname="Other literature type"
schemeid="dnet:publication_resource" schemename="dnet:publication_resource"/>
<alternateidentifier classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">34689630
</alternateidentifier>
<alternateidentifier classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">
10.1098/rsta.2020.0257
</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>http://creativecommons.org/licenses/by/4.0/</license>
<webresource>
<url>https://ora.ox.ac.uk/objects/uuid:9fc4563a-07e1-41d1-8b99-31ce2f8ac027</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993"/>
<hostedby
name="Philosophical Transactions of the Royal Society A Mathematical Physical and Engineering Sciences"
id="issn___print::c72e72d9860ed6d9ec013e20c9421ba0"/>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types">10.1098/rsta.2020.0257
</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<webresource>
<url>https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8543046</url>
</webresource>
</instance>
<instance>
<accessright classid="CLOSED" classname="Closed Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<hostedby
name="Philosophical Transactions of the Royal Society A Mathematical Physical and Engineering Sciences"
id="issn___print::c72e72d9860ed6d9ec013e20c9421ba0"/>
<dateofacceptance>2021-10-25</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types">10.1098/rsta.2020.0257
</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>https://royalsociety.org/journals/ethics-policies/data-sharing-mining/</license>
<webresource>
<url>https://doi.org/10.1098/rsta.2020.0257</url>
</webresource>
</instance>
<instance>
<accessright classid="UNKNOWN" classname="not available" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="Europe PubMed Central"
id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"/>
<hostedby name="Unknown Repository" id="openaire____::55045bd2a65019fd8e6741a755395c8c"/>
<dateofacceptance>2021-10-25</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">34689630
</pid>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
trust="0.9">PMC8543046
</pid>
<alternateidentifier classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">
10.1098/rsta.2020.0257
</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<webresource>
<url>https://pubmed.ncbi.nlm.nih.gov/34689630</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="PubMed Central" id="opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"/>
<hostedby name="Europe PubMed Central" id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"/>
<dateofacceptance>2021-10-01</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">34689630
</pid>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">PMC8543046
</pid>
<alternateidentifier classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">
10.1098/rsta.2020.0257
</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<webresource>
<url>http://europepmc.org/articles/PMC8543046</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -1,238 +0,0 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_dedup___::88a9861b26cdda1c3dd162d11f0bedbe</dri:objIdentifier>
<dri:dateOfCollection>2023-03-13T00:12:27+0000</dri:dateOfCollection>
<dri:dateOfTransformation>2023-03-13T04:39:52.807Z</dri:dateOfTransformation>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<collectedfrom name="NARCIS" id="openaire____::fdb035c8b3e0540a8d9a561a6c44f4de" />
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2" />
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993" />
<collectedfrom name="ORCID" id="openaire____::806360c771262b4d6770e7cdf04b5c5a" />
<collectedfrom name="Microsoft Academic Graph" id="openaire____::5f532a3fc4f1ea403f37070f59a7a53a" />
<collectedfrom name="Europe PubMed Central" id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c" />
<collectedfrom name="NARCIS" id="eurocrisdris::fe4903425d9040f680d8610d9079ea14" />
<originalId>oai:pure.eur.nl:publications/70bf9bd0-5ea6-45fd-bb62-8d79b48cd69f</originalId>
<originalId>50|narcis______::3df5b8b060b819af0d439dd6751c8a77</originalId>
<originalId>10.2196/33081</originalId>
<originalId>50|doiboost____::88a9861b26cdda1c3dd162d11f0bedbe</originalId>
<originalId>3212148341</originalId>
<originalId>od_______267::276eb3ebee07cf1f3e8bfc43926fd0c2</originalId>
<originalId>35099399</originalId>
<originalId>PMC8844982</originalId>
<originalId>oai:services.nod.dans.knaw.nl:Publications/eur:oai:pure.eur.nl:publications/70bf9bd0-5ea6-45fd-bb62-8d79b48cd69f</originalId>
<originalId>50|dris___00893::107e97e645cbb06fb7b454ce2569d6c2</originalId>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.2196/33081</pid>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">35099399</pid>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">PMC8844982</pid>
<measure id="influence" score="4.9560573E-9" class="C" />
<measure id="popularity" score="1.6705286E-8" class="C" />
<measure id="influence_alt" score="6" class="C" />
<measure id="popularity_alt" score="2.16" class="C" />
<measure id="impulse" score="6" class="C" />
<title classid="alternative title" classname="alternative title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Mapping the Ethical Issues of Digital Twins for Personalised Healthcare Service (Preprint)</title>
<title classid="subtitle" classname="subtitle" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title" inferred="false" provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">Preliminary Mapping Study</title>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title" inferred="false" provenanceaction="sysimport:crosswalk:repository" trust="0.9">Ethical Issues of Digital Twins for Personalized Health Care Service: Preliminary Mapping Study</title>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<creator rank="2" URL="https://academic.microsoft.com/#/detail/3211925536" orcid="0000-0001-9882-0389">Ki-hun Kim</creator>
<creator rank="1" URL="https://academic.microsoft.com/#/detail/3214669734" orcid="0000-0001-5334-8615">Pei-hua Huang</creator>
<creator rank="3" URL="https://academic.microsoft.com/#/detail/3214611946" orcid="0000-0003-4283-9659">Maartje Schermer</creator>
<contributor>Public Health</contributor>
<country classid="NL" classname="Netherlands" schemeid="dnet:countries" schemename="dnet:countries" />
<dateofacceptance>2022-01-01</dateofacceptance>
<description>Background: The concept of digital twins has great potential for transforming the existing health care system by making it more personalized. As a convergence of health care, artificial intelligence, and information and communication technologies, personalized health care services that are developed under the concept of digital twins raise a myriad of ethical issues. Although some of the ethical issues are known to researchers working on digital health and personalized medicine, currently, there is no comprehensive review that maps the major ethical risks of digital twins for personalized health care services. Objective This study aims to fill the research gap by identifying the major ethical risks of digital twins for personalized health care services. We first propose a working definition for digital twins for personalized health care services to facilitate future discussions on the ethical issues related to these emerging digital health services. We then develop a process-oriented ethical map to identify the major ethical risks in each of the different data processing phases. MethodsWe resorted to the literature on eHealth, personalized medicine, precision medicine, and information engineering to identify potential issues and developed a process-oriented ethical map to structure the inquiry in a more systematic way. The ethical map allows us to see how each of the major ethical concerns emerges during the process of transforming raw data into valuable information. Developers of a digital twin for personalized health care service may use this map to identify ethical risks during the development stage in a more systematic way and can proactively address them. ResultsThis paper provides a working definition of digital twins for personalized health care services by identifying 3 features that distinguish the new application from other eHealth services. On the basis of the working definition, this paper further layouts 10 major operational problems and the corresponding ethical risks. ConclusionsIt is challenging to address all the major ethical risks that a digital twin for a personalized health care service might encounter proactively without a conceptual map at hand. The process-oriented ethical map we propose here can assist the developers of digital twins for personalized health care services in analyzing ethical risks in a more systematic manner. </description>
<subject classid="mesh" classname="Medical Subject Headings" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="true" inferenceprovenance="iis::document_classes" provenanceaction="iis" trust="0.891">education</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Health Informatics</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Ethical issues</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Healthcare service</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Psychology</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Internet privacy</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.4395309">business.industry</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.4395309">business</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Preprint</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Artificial Intelligence</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Delivery of Health Care</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Health Services</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Humans</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Precision Medicine</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Telemedicine</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03 medical and health sciences</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0302 clinical medicine
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030212 General &amp; Internal Medicine
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03021201 Health care/Health care quality - datum/health care
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03021201 Health care/Health care quality - datum/electronic health
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0301 basic medicine
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030104 Developmental Biology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0303 health sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030304 Developmental Biology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030304 Developmental Biology - datum/datum management/ethical
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">06 humanities and the arts</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0603 philosophy, ethics and religion
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">060301 Applied Ethics
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">06030101 Bioethics/Coordinates on Wikidata - intelligence/artificial intelligence/ethical
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">06030101 Bioethics/Coordinates on Wikidata - datum/ethical
</subject>
<language classid="eng" classname="English" schemeid="dnet:languages" schemename="dnet:languages" />
<relevantdate classid="created" classname="created" schemeid="dnet:dataCite_date" schemename="dnet:dataCite_date">2021-11-17</relevantdate>
<relevantdate classid="published-online" classname="published-online" schemeid="dnet:dataCite_date" schemename="dnet:dataCite_date">2022-01-31</relevantdate>
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date" schemename="dnet:dataCite_date" inferred="false" provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">2022-01-01</relevantdate>
<source>Crossref</source>
<source />
<source>Journal of Medical Internet Research, 24(1):e33081. Journal of medical Internet Research</source>
<source>urn:issn:1439-4456</source>
<source>VOLUME=24;ISSUE=1;ISSN=1439-4456;TITLE=Journal of Medical Internet Research</source>
<format>application/pdf</format>
<resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<resourcetype classid="0001" classname="0001" schemeid="dnet:dataCite_resource" schemename="dnet:dataCite_resource" />
<context id="knowmad" label="Knowmad Institut" type="community" />
<context id="dth" label="Digital Twins in Health" type="community"/>
<datainfo>
<inferred>true</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.8</trust>
<inferenceprovenance>dedup-result-decisiontree-v3</inferenceprovenance>
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels />
<children>
<result objidentifier="narcis______::3df5b8b060b819af0d439dd6751c8a77">
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title" inferred="false" provenanceaction="sysimport:crosswalk:repository" trust="0.9">Ethical Issues of Digital Twins for Personalized Health Care Service: Preliminary Mapping Study</title>
<collectedfrom name="NARCIS" id="openaire____::fdb035c8b3e0540a8d9a561a6c44f4de" />
<dateofacceptance>2022-01-01</dateofacceptance>
</result>
<result objidentifier="pmid________::70f4213809e44f787bf9f6ffa50ff98f">
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">PMC8844982</pid>
<collectedfrom name="Europe PubMed Central" id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c" />
<dateofacceptance>2021-08-23</dateofacceptance>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">Ethical Issues of Digital Twins for Personalized Health Care Service: Preliminary Mapping Study.</title>
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">35099399</pid>
</result>
<result objidentifier="dris___00893::107e97e645cbb06fb7b454ce2569d6c2">
<title classid="subtitle" classname="subtitle" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title" inferred="false" provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">Preliminary Mapping Study</title>
<dateofacceptance>2022-01-01</dateofacceptance>
<collectedfrom name="NARCIS" id="eurocrisdris::fe4903425d9040f680d8610d9079ea14" />
</result>
<result objidentifier="doi_________::88a9861b26cdda1c3dd162d11f0bedbe">
<dateofacceptance>2022-01-31</dateofacceptance>
<title classid="alternative title" classname="alternative title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Mapping the Ethical Issues of Digital Twins for Personalised Healthcare Service (Preprint)</title>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2" />
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993" />
<collectedfrom name="ORCID" id="openaire____::806360c771262b4d6770e7cdf04b5c5a" />
<collectedfrom name="Microsoft Academic Graph" id="openaire____::5f532a3fc4f1ea403f37070f59a7a53a" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.2196/33081</pid>
<publisher>JMIR Publications Inc.</publisher>
</result>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Europe PubMed Central" id="opendoar____::8b6dd7db9af49e67306feb59a8bdc52c" />
<hostedby name="Journal of Medical Internet Research" id="doajarticles::972bc6f95b43b5ad42f75695b1d8e948" />
<dateofacceptance>2021-08-23</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="pmid" classname="PubMed ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">35099399</pid>
<pid classid="pmc" classname="PubMed Central ID" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">PMC8844982</pid>
<alternateidentifier classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.2196/33081</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<webresource>
<url>https://pubmed.ncbi.nlm.nih.gov/35099399</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2" />
<collectedfrom name="NARCIS" id="eurocrisdris::fe4903425d9040f680d8610d9079ea14" />
<hostedby name="Journal of Medical Internet Research" id="doajarticles::972bc6f95b43b5ad42f75695b1d8e948" />
<dateofacceptance>2022-01-01</dateofacceptance>
<dateofacceptance>2022-01-31</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.2196/33081</pid>
<alternateidentifier classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">10.2196/33081</alternateidentifier>
<alternateidentifier classid="urn" classname="urn" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">urn:nbn:nl:ui:15-70bf9bd0-5ea6-45fd-bb62-8d79b48cd69f</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<webresource>
<url>https://doi.org/10.2196/33081</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="NARCIS" id="openaire____::fdb035c8b3e0540a8d9a561a6c44f4de" />
<hostedby name="NARCIS" id="openaire____::fdb035c8b3e0540a8d9a561a6c44f4de" />
<dateofacceptance>2022-01-01</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<alternateidentifier classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:crosswalk:repository" trust="0.9">10.2196/33081</alternateidentifier>
<alternateidentifier classid="urn" classname="urn" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:crosswalk:repository" trust="0.9">urn:nbn:nl:ui:15-70bf9bd0-5ea6-45fd-bb62-8d79b48cd69f</alternateidentifier>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<webresource>
<url>https://pure.eur.nl/en/publications/70bf9bd0-5ea6-45fd-bb62-8d79b48cd69f</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -1,223 +0,0 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::c166d06aeaed817937a79a400906a4b9</dri:objIdentifier>
<dri:dateOfCollection>2023-03-09T00:12:02.045Z</dri:dateOfCollection>
<dri:dateOfTransformation>2023-03-09T00:24:00.8Z</dri:dateOfTransformation>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<collectedfrom name="OpenAPC Global Initiative"
id="apc_________::e2b1600b229fc30663c8a1f662debddf"/>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<collectedfrom name="UnpayWall" id="openaire____::8ac8380272269217cb09a928c8caa993"/>
<collectedfrom name="ORCID" id="openaire____::806360c771262b4d6770e7cdf04b5c5a"/>
<collectedfrom name="Microsoft Academic Graph" id="openaire____::5f532a3fc4f1ea403f37070f59a7a53a"/>
<originalId>50|openapc_____::c166d06aeaed817937a79a400906a4b9</originalId>
<originalId>10.3390/pr9111967</originalId>
<originalId>pr9111967</originalId>
<originalId>50|doiboost____::c166d06aeaed817937a79a400906a4b9</originalId>
<originalId>3209532762</originalId>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:datasetarchive" trust="0.9">10.3390/pr9111967
</pid>
<measure id="influence" score="5.187591E-9" class="C"/>
<measure id="popularity" score="2.4479982E-8" class="C"/>
<measure id="influence_alt" score="10" class="C"/>
<measure id="popularity_alt" score="3.6" class="C"/>
<measure id="impulse" score="10" class="C"/>
<title classid="alternative title" classname="alternative title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title">Digital Twins for Continuous mRNA Production
</title>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<dateofacceptance>2021-11-04</dateofacceptance>
<description>The global coronavirus pandemic continues to restrict public life worldwide. An
effective means of limiting the pandemic is vaccination. Messenger ribonucleic acid (mRNA)
vaccines currently available on the market have proven to be a well-tolerated and effective
class of vaccine against coronavirus type 2 (CoV2). Accordingly, demand is presently
outstripping mRNA vaccine production. One way to increase productivity is to switch from the
currently performed batch to continuous in vitro transcription, which has proven to be a crucial
material-consuming step. In this article, a physico-chemical model of in vitro mRNA
transcription in a tubular reactor is presented and compared to classical batch and continuous
in vitro transcription in a stirred tank. The three models are validated based on a distinct and
quantitative validation workflow. Statistically significant parameters are identified as part of
the parameter determination concept. Monte Carlo simulations showed that the model is precise,
with a deviation of less than 1%. The advantages of continuous production are pointed out
compared to batchwise in vitro transcription by optimization of the space-time yield.
Improvements of a factor of 56 (0.011 µM/min) in the case of the continuously stirred tank
reactor (CSTR) and 68 (0.013 µM/min) in the case of the plug flow reactor (PFR) were found.
</description>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Process Chemistry and Technology
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Chemical Engineering (miscellaneous)
</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Bioengineering
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Coronavirus
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.4019353">medicine.disease_cause
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:actionset" trust="0.4019353">medicine
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Continuous production
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Messenger RNA
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Continuous stirred-tank reactor
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Plug flow reactor model
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Mathematics
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">In vitro transcription
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Biological system
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Yield (chemistry)
</subject>
<subject classid="MAG" classname="Microsoft Academic Graph classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies">Public life
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">03 medical and health sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0301 basic medicine
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030104 Developmental Biology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0303 health sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">030304 Developmental Biology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">02 engineering and technology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0210 nano-technology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">021001 Nanoscience &amp;
Nanotechnology
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">01 natural sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">0104 chemical sciences
</subject>
<subject classid="FOS" classname="Fields of Science and Technology classification"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="true"
inferenceprovenance="update" provenanceaction="subject:fos">010405 Organic Chemistry
</subject>
<language classid="UNKNOWN" classname="UNKNOWN" schemeid="dnet:languages"
schemename="dnet:languages"/>
<relevantdate classid="created" classname="created" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date">2021-11-05
</relevantdate>
<relevantdate classid="published-online" classname="published-online" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date">2021-11-04
</relevantdate>
<source>Crossref</source>
<source/>
<resulttype classid="publication" classname="publication" schemeid="dnet:result_typologies"
schemename="dnet:result_typologies"/>
<resourcetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<processingchargeamount>2146.08</processingchargeamount>
<processingchargecurrency>EUR</processingchargecurrency>
<journal issn="2227-9717">Processes</journal>
<context id="covid-19" label="COVID-19" type="community"/>
<context id="dth" label="Digital Twins in Health" type="community"/>
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance>null</inferenceprovenance>
<provenanceaction classid="sysimport:actionset" classname="Harvested"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</datainfo>
<rels/>
<children>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<collectedfrom name="Crossref" id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<hostedby name="Processes" id="doajarticles::9c4841602dcae937ac9c86cfb03c5ee1"/>
<dateofacceptance>2021-11-04</dateofacceptance>
<instancetype classid="0001" classname="Article" schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types">10.3390/pr9111967
</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>https://creativecommons.org/licenses/by/4.0/</license>
<webresource>
<url>https://doi.org/10.3390/pr9111967</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -1,138 +0,0 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_dedup___::10a910f4a66b7f4bce8407d7a486a80a</dri:objIdentifier>
<dri:dateOfCollection>2023-04-05T00:36:27+0000</dri:dateOfCollection>
<dri:dateOfTransformation>2023-04-05T07:33:52.185Z</dri:dateOfTransformation>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<originalId>50|datacite____::10a910f4a66b7f4bce8407d7a486a80a</originalId>
<originalId>10.5281/zenodo.6967373</originalId>
<originalId>50|datacite____::172969c66c312a9656fc745f0ec62ce5</originalId>
<originalId>10.5281/zenodo.6969999</originalId>
<originalId>50|datacite____::4fa8f1c89ff11e8e99f9ded870ade80d</originalId>
<originalId>10.5281/zenodo.6967372</originalId>
<originalId>50|datacite____::a466b6173773d742b7a5881682748a8c</originalId>
<originalId>10.5281/zenodo.6970067</originalId>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967373</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6969999</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967372</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6970067</pid>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia</title>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<creator rank="1">Marasco Pier Lorenzo</creator>
<dateofacceptance>2022-08-05</dateofacceptance>
<description>Sentinel-3 NDVI Analysis Ready Data (ARD) (C_GLS_NDVI_20220101_20220701_Lombardia_S3_2.nc) product provided by the Copernicus Global Land Service [3]. The file C_GLS_NDVI_20220101_20220701_Lombardia_S3_2_masked.nc is derived from C_GLS_NDVI_20220101_20220701_Lombardia_S3_2.nc but values have been scaled (raw_value * ( 1/250) - 0.08) and values lower than -0.08 and greater than 0.92 have been removed (set to missing values). The original dataset can also be discovered through the OpenEO API [5] from the CGLS distributor VITO [4]. Access is free of charge but an EGI registration is needed. The file called Italy.geojson has been created using the Global Administrative Unit Layers GAUL G2015_2014 provided by FAO-UN (see Documentation). It only contains information related to Italy. Further info about drought indexes can be found in the Integrated Drought Management Programme [5] [1] Application of vegetation index and brightness temperature for drought detection [2] NDVI [3] Copernicus Global Land Service [4] Vito [5] OpenEO [5] Integrated Drought Management</description>
<description>These datasets are used for training purposes. See https://pangeo-data.github.io/foss4g-2022/intro.html</description>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">NDVI</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">vegetaion</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">Copernicus Global Land Service</subject>
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">pangeo</subject>
<language classid="eng" classname="English" schemeid="dnet:languages" schemename="dnet:languages" />
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date" schemename="dnet:dataCite_date">2022-08-05</relevantdate>
<publisher>Zenodo</publisher>
<resulttype classid="dataset" classname="dataset" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<resourcetype classid="UNKNOWN" classname="Unknown" schemeid="dnet:dataCite_resource" schemename="dnet:dataCite_resource" />
<eoscifguidelines code="EOSC::Jupyter Notebook" label="EOSC::Jupyter Notebook" semanticrelation="compliesWith" />
<datainfo>
<inferred>true</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.8</trust>
<inferenceprovenance>dedup-result-decisiontree-v3</inferenceprovenance>
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels></rels>
<children>
<result objidentifier="doi_________::4fa8f1c89ff11e8e99f9ded870ade80d">
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967372</pid>
<dateofacceptance>2022-08-05</dateofacceptance>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia</title>
</result>
<result objidentifier="doi_________::a466b6173773d742b7a5881682748a8c">
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6970067</pid>
<dateofacceptance>2022-08-05</dateofacceptance>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia</title>
</result>
<result objidentifier="doi_________::172969c66c312a9656fc745f0ec62ce5">
<publisher>Zenodo</publisher>
<dateofacceptance>2022-08-05</dateofacceptance>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6969999</pid>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia</title>
</result>
<result objidentifier="doi_________::10a910f4a66b7f4bce8407d7a486a80a">
<publisher>Zenodo</publisher>
<dateofacceptance>2022-08-05</dateofacceptance>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia</title>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967373</pid>
</result>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<dateofacceptance>2022-08-05</dateofacceptance>
<instancetype classid="0021" classname="Dataset" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967373</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<license>https://creativecommons.org/licenses/by/4.0/legalcode</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.6967373</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<dateofacceptance>2022-08-05</dateofacceptance>
<instancetype classid="0021" classname="Dataset" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6970067</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<license>https://creativecommons.org/licenses/by/4.0/legalcode</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.6970067</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<dateofacceptance>2022-08-05</dateofacceptance>
<instancetype classid="0021" classname="Dataset" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6969999</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<license>https://creativecommons.org/licenses/by/4.0/legalcode</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.6969999</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<dateofacceptance>2022-08-05</dateofacceptance>
<instancetype classid="0021" classname="Dataset" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.6967372</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels" schemename="dnet:review_levels" />
<license>https://creativecommons.org/licenses/by/4.0/legalcode</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.6967372</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

File diff suppressed because one or more lines are too long

View File

@ -1,10 +1,10 @@
{
"eoscifguidelines": [
{
"code": "EOSC::Jupyter Notebook",
"label": "EOSC::Jupyter Notebook",
"url": "",
"semanticRelation": "compliesWith"
"code" : "EOSC::Jupyter Notebook",
"label" : "EOSC::Jupyter Notebook",
"url" : "",
"semanticRelation" : "compliesWith"
}
],
"measures": [
@ -431,26 +431,7 @@
"value": "VIRTA"
}
],
"context": [
{
"id": "eosc",
"dataInfo": [
{
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": "",
"provenanceaction": {
"classid": "sysimport:crosswalk",
"classname": "sysimport:crosswalk",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
]
}
],
"context": [],
"contributor": [],
"country": [],
"coverage": [],

View File

@ -48,25 +48,16 @@
<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
<case to="clear-working-dir">${wf:conf('resume') eq "start"}</case>
<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
<!-- Aggregation of impact scores on the project level -->
<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>
<default to="clear-working-dir" />
<default to="create-openaire-ranking-graph" />
</switch>
</decision>
<action name="clear-working-dir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="create-openaire-ranking-graph"/>
<error to="clear-working-dir-fail"/>
</action>
<!-- initial step: create citation network -->
<action name="create-openaire-ranking-graph">
<spark xmlns="uri:oozie:spark-action:0.2">
@ -615,10 +606,6 @@
<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="clear-working-dir-fail">
<message>Re-create working dir failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Define ending node -->
<end name="end" />

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-hist-snaps</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,223 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 70 * 6144 MB = ~430 GB
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements[@]} # number of pending view statements
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view not to be created is that it depends on another view which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements[@]} # number of view statements that still need a retry
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single words.
# Taking the create table statement from the Ocean cluster, just to check if it's a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB

View File

@ -1,41 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
SOURCE=$1
PRODUCTION=$2
SHADOW=$3
MONITOR_PROD=$4
MONITOR_IRISH_PROD=$5
echo ${SOURCE}
echo ${PRODUCTION}
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Updating ${PRODUCTION} historical snapshots database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production monitor db ready!"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots as select * from ${SOURCE}.historical_snapshots"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots_fos as select * from ${SOURCE}.historical_snapshots_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish as select * from ${SOURCE}.historical_snapshots_irish"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos as select * from ${SOURCE}.historical_snapshots_irish_fos"
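# A hedged invocation sketch (all database names below are hypothetical examples); the script expects five
# positional arguments, mapping to SOURCE, PRODUCTION, SHADOW, MONITOR_PROD and MONITOR_IRISH_PROD:
#   ./finalizeImpalaCluster.sh openaire_hist_snaps openaire_prod_hist_snaps openaire_hist_snaps_shadow \
#       openaire_prod_stats_monitor openaire_prod_stats_monitor_irish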

View File

@ -1,27 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
#update Monitor DB IRISH
#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -1,82 +0,0 @@
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_fos;
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
rf.lvl1,
rf.lvl2,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fos rf on rf.id=r.id
left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_fos purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_fos_tmp;
drop table if exists ${monitor_db_name}.historical_snapshots_fos purge;
create table ${monitor_db_name}.historical_snapshots_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_fos;
drop table ${hist_db_name}.historical_snapshots_fos_tmp purge;
INSERT INTO ${hist_db_name}.historical_snapshots_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots;
INSERT INTO ${hist_db_name}.historical_snapshots_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots purge;
CREATE TABLE ${hist_db_name}.historical_snapshots STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_tmp;
drop table if exists ${monitor_db_name}.historical_snapshots purge;
create table ${monitor_db_name}.historical_snapshots stored as parquet
as select * from ${hist_db_name}.historical_snapshots;
drop table ${hist_db_name}.historical_snapshots_tmp purge;

View File

@ -1,91 +0,0 @@
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish_fos;
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
rf.lvl1,
rf.lvl2,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_irish_db_name}.result r
left outer join ${stats_irish_db_name}.result_fos rf on rf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_irish_fos purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_fos_irish_tmp;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish;
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_irish_db_name}.result r
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_irish purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_irish_tmp;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish;
drop table ${hist_db_name}.historical_snapshots_irish_tmp purge;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;

View File

@ -1,92 +0,0 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Historical Snapshots database creation
--------------------------------------------------------------
--------------------------------------------------------------
DROP database IF EXISTS ${hist_db_name} CASCADE;
CREATE database ${hist_db_name};
drop table if exists ${hist_db_name}.historical_snapshots_fos_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_tmp
(
hist_date STRING,
total INT,
type STRING,
lvl1 STRING,
lvl2 STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp
(
hist_date STRING,
total INT,
type STRING,
lvl1 STRING,
lvl2 STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_tmp
(
hist_date STRING,
total INT,
type STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_irish_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_tmp
(
hist_date STRING,
total INT,
type STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');

View File

@ -1,159 +0,0 @@
<workflow-app name="Stats Hist Snapshots" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>hist_db_name</name>
<description>the target hist database name</description>
</property>
<property>
<name>hist_db_name_prev</name>
<description>the hist database name of previous_month</description>
</property>
<property>
<name>stats_db_name</name>
<description>the stats db name</description>
</property>
<property>
<name>stats_irish_db_name</name>
<description>the stats irish db name</description>
</property>
<property>
<name>monitor_db_name</name>
<description>the monitor db name</description>
</property>
<property>
<name>monitor_irish_db_name</name>
<description>the irish monitor db name</description>
</property>
<property>
<name>hist_db_prod_name</name>
<description>the production db</description>
</property>
<property>
<name>hist_db_shadow_name</name>
<description>the production shadow db</description>
</property>
<property>
<name>hist_date</name>
<description>the snapshot date</description>
</property>
<property>
<name>hive_metastore_uris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hive_jdbc_url</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="CreateDB">${wf:conf('resumeFrom') eq 'CreateDB'}</case>
<case to="BuildHistSnaps">${wf:conf('resumeFrom') eq 'BuildHistSnaps'}</case>
<case to="BuildHistSnapsIrish">${wf:conf('resumeFrom') eq 'BuildHistSnapsIrish'}</case>
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
<default to="BuildHistSnaps"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="CreateDB">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/CreateDB.sql</script>
<param>hist_db_name=${hist_db_name}</param>
</hive2>
<ok to="BuildHistSnaps"/>
<error to="Kill"/>
</action>
<action name="BuildHistSnaps">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/BuildHistSnapsAll.sql</script>
<param>hist_db_name=${hist_db_name}</param>
<param>hist_db_name_prev=${hist_db_name_prev}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>monitor_db_name=${monitor_db_name}</param>
<param>hist_date=${hist_date}</param>
</hive2>
<ok to="BuildHistSnapsIrish"/>
<error to="Kill"/>
</action>
<action name="BuildHistSnapsIrish">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/BuildHistSnapsIrish.sql</script>
<param>hist_db_name=${hist_db_name}</param>
<param>hist_db_name_prev=${hist_db_name_prev}</param>
<param>stats_irish_db_name=${stats_irish_db_name}</param>
<param>monitor_irish_db_name=${monitor_irish_db_name}</param>
<param>hist_date=${hist_date}</param>
</hive2>
<ok to="Step2-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step2-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${hist_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="Step3-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step3-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${hist_db_name}</argument>
<argument>${hist_db_prod_name}</argument>
<argument>${hist_db_shadow_name}</argument>
<argument>${monitor_db_prod_name}</argument>
<argument>${monitor_irish_db_prod_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-monitor-irish</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,222 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This has to be used with "sed -e", so that the "|" delimiter can be used (the "/" delimiter conflicts with the URIs).
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
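# As a hedged illustration of the intended effect of the reserved-word sed arguments above (the query below is a
# made-up example, not a real statement from any DB):
#   echo "select t.date, t.hash, t.location from x t" | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_3}"
# should print, roughly:
#   select t.`date`, t.`hash`, t.`location` from x t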
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using up to 70 map tasks, each with a max bandwidth of 150 MB/s and 6144 MB of memory (70 * 6144 MB ~ 430 GB in total).
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop.
# The "ug" args cannot be used, as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
distcp_exit_status=$?
if [ ${distcp_exit_status} -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}' WITH 'hadoop distcp'. GOT EXIT STATUS: ${distcp_exit_status}\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single words.
# Check if this is a view by showing the create-statement, which should print "create view" for a view, not "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise "grep" is not able to match the multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case, to either move on with the remaining views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single words.
# Take the create-table statement from the Ocean cluster, just to check if it's a view, as the output is easier to parse than the one from impala-shell on the Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not need to match multiple lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB

View File

@ -1,23 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
SOURCE=$1
PRODUCTION=$2
echo ${SOURCE}
echo ${PRODUCTION}
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Updating ${PRODUCTION} monitor database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production monitor db ready!"

View File

@ -1,28 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export GRAPHDB=$5
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
#update Monitor DB IRISH
#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$5/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -1,241 +0,0 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
drop table if exists TARGET.irish_funders;
create TEMPORARY table TARGET.irish_funders as
select distinct xpath_string(fundingtree[0].value, '//funder/name') as funder from GRAPHDB.project
where xpath_string(fundingtree[0].value, '//funder/jurisdiction')='IE';
--create TEMPORARY table TARGET.irish_funders as
--select distinct name as funder from SOURCE.fundref where country='IE';
drop table if exists TARGET.result;
create table TARGET.result stored as parquet as
select distinct * from (
select r.*
from SOURCE.result r
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
union all
select r.*
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization and o.country='IE'
union all
select r.*
from SOURCE.result r
join SOURCE.result_pids pid on pid.id=r.id
join stats_ext.transformative_facts tf on tf.doi=pid.pid
) foo;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
create table TARGET.funder stored as parquet as select * from SOURCE.funder where country='IE';
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
-- projects
create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
-- indicators
-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s;
create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);

View File

@ -1,118 +0,0 @@
<workflow-app name="Irish Monitor Update" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>
<description>the target stats database name</description>
</property>
<property>
<name>graph_db_name</name>
<description>the graph database name</description>
</property>
<property>
<name>monitor_irish_db_name</name>
<description>the target monitor db name</description>
</property>
<property>
<name>monitor_irish_db_prod_name</name>
<description>the name of the production monitor db</description>
</property>
<property>
<name>monitor_irish_db_shadow_name</name>
<description>the name of the shadow monitor db</description>
</property>
<property>
<name>hive_metastore_uris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hive_jdbc_url</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="Step1-buildIrishMonitorDB">${wf:conf('resumeFrom') eq 'Step1-buildIrishMonitorDB'}</case>
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
<default to="Step1-buildIrishMonitorDB"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="Step1-buildIrishMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>monitor_irish.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${monitor_irish_db_name}</argument>
<argument>${monitor_irish_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/buildIrishMonitorDB.sql</argument>
<argument>${graph_db_name}</argument>
<file>monitor_irish.sh</file>
</shell>
<ok to="Step2-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step2-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${monitor_irish_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="Step3-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step3-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${monitor_irish_db_name}</argument>
<argument>${monitor_irish_db_prod_name}</argument>
<argument>${monitor_irish_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-monitor-update</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,223 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This has to be used with "sed -e", so that the "|" delimiter can be used (the "/" delimiter conflicts with the URIs).
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
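# Illustrative example (not part of the original script) of what these sed arguments do to a statement:
#   echo "select t.date, t.hash from test.t" | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_3}"
#   # -> select t.`date`, t.`hash` from test.t
# i.e. reserved column names get back-quoted so that Impala can parse the re-played view definitions.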
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 maps * 150 MB/s = 10.5 GB/s
# Using max memory of: 70 maps * 6144 MB = ~430 GB
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
distcp_exit_status=$?
if [ $distcp_exit_status -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: ${distcp_exit_status}\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file of the table (just copied to the Impala cluster) and let Impala extract the table-schema from that file itself.
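# For illustration only (hypothetical db/table/file names, not taken from this repo), the statement built further below expands to something like:
#   create table stats_db.result like parquet 'hdfs://impala-cluster-mn1.openaire.eu:8020/user/hive/warehouse/stats_db.db/result/000000_0' stored as parquet;
# i.e. Impala infers the column types directly from the parquet footer of that file.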
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views, without any potential "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single words.
# Check if this is a view by looking at its create-statement: it prints "CREATE VIEW" for a view, rather than "CREATE TABLE". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match a multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view not to be created is that it depends on another view which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single words.
# Taking the create-table statement from the Ocean cluster, just to check if it's a view, as its output is easier to parse than that of impala-shell on the Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB
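# Example invocation (assumed; the script name and argument values are hypothetical, the positional meaning comes from "$1"/"$2" above):
#   ./copyDataToImpalaCluster.sh <monitor_db> <hdfs_user>
# This copies both "<monitor_db>_institutions" and "<monitor_db>" from the Ocean cluster to the Impala cluster.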

Some files were not shown because too many files have changed in this diff