
Compare commits


89 Commits

Author SHA1 Message Date
Miriam Baglioni a2b708bb71 [AffiliationIngestion]refactoring 2024-06-29 18:36:47 +02:00
Miriam Baglioni 9cbe966b4a [AffiliationIngestion]refactoring 2024-06-29 18:35:49 +02:00
Miriam Baglioni 236b64d830 [AffiliationIngestion] Extended the ingestion of affiliations from OpenAIRE to also include links derived from Web Crawl. Extended the test. Added to Constants the id and name of the web crawl datasource, used both here and in the ingestion of links from web crawl 2024-06-29 18:29:20 +02:00
Claudio Atzori 14539f9c8b [graph provision] publicFormat workflow parameter defined as optional 2024-06-28 14:55:18 +02:00
Claudio Atzori 1bc8c5d173 [graph provision] fixed serialization of the instancetypes 2024-06-28 14:54:28 +02:00
Claudio Atzori 1ccf01cdb8 Using the updated Solr JSON payload model classes 2024-06-28 12:38:07 +02:00
Claudio Atzori b79cb155ba Merge pull request 'Fix permissions-issue in Stats-workflow, step22a-createPDFsAggregated.' (#450) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#450
2024-06-26 10:11:34 +02:00
Claudio Atzori 33a02c5b9e Merge pull request 'Change the selection criteria for the pivot record of a group so that best pid type becomes the first criterion. This will have the effect of converging to records having a DOI pid' (#446) from pivotselectionbypid into beta
Reviewed-on: D-Net/dnet-hadoop#446
2024-06-26 10:10:13 +02:00
Claudio Atzori 1182bca9eb Merge pull request 'Add support to create/update Solr collection aliases' (#449) from 9872-create-solr-collection-aliases into beta
Reviewed-on: D-Net/dnet-hadoop#449
2024-06-26 10:09:51 +02:00
Claudio Atzori 1c30eacac2 updated index feeding procedure to exploit the collection aliases 2024-06-25 15:27:38 +02:00
Claudio Atzori 6055212f77 merged from the json_payload branch 2024-06-25 12:39:02 +02:00
Claudio Atzori 0031cf849e Merge branch 'beta' into 9872-create-solr-collection-aliases 2024-06-25 09:58:01 +02:00
Serafeim Chatzopoulos 9f6e16a03c Add support to create/update Solr collection aliases 2024-06-20 16:03:15 +03:00
Lampros Smyrnaios 66cd28f70a - Fix the missing "export HADOOP_USER_NAME" statement in "createPDFsAggregated.sh", which caused permission issues when creating tables with Impala.
- Remove unused "--user" parameter in "impala-shell" calls.
- Code polishing.
2024-06-20 14:33:46 +03:00
Lampros Smyrnaios c6b1ab2a18 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-06-20 14:33:05 +03:00
Miriam Baglioni d35edac212 [IrishFunderList] made changes according to 9635 comments 20, 21, 22 and 23 2024-06-20 12:28:28 +02:00
Miriam Baglioni 6421f8fece Merge remote-tracking branch 'origin/beta' into beta 2024-06-19 11:12:15 +02:00
Miriam Baglioni ac270f795b [IrishFunderList] made changes according to 9635 comments 14, 15 and 16 2024-06-19 11:11:52 +02:00
Lampros Smyrnaios 236aed8954 Merge remote-tracking branch 'origin/beta' into beta 2024-06-18 17:12:35 +03:00
Claudio Atzori dd541f8cf5 Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster.' (#447) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#447
2024-06-18 15:52:30 +02:00
Lampros Smyrnaios ff335578ea Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-06-18 14:52:31 +03:00
Lampros Smyrnaios 285416c74e Merge branch 'beta' into beta 2024-06-18 13:50:38 +02:00
Lampros Smyrnaios 3095047e5e Miscellaneous updates to the copying operation to Impala Cluster:
- Fix failure to break out of the VIEWS infinite loop when "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false".
- Exit the script when no active HDFS node was found, regardless of "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR".
- Fix view_name recognition in a log message by using the more advanced "Perl-Compatible Regular Expressions" in "grep".
- Add error-handling for "compute stats" errors.
2024-06-18 14:40:41 +03:00
Antonis Lempesis 0456f1b788 Merge remote-tracking branch 'origin/beta' into beta 2024-06-14 15:11:30 +03:00
Antonis Lempesis 38636942c7 filtering out deletedbyinference and invisible results from accessroute 2024-06-14 15:11:19 +03:00
Lampros Smyrnaios d942a1101b Miscellaneous updates to the copying operation to Impala Cluster:
- Show some counts and the elapsed time for various sub-tasks.
- Code polishing.
2024-06-14 12:14:38 +03:00
Giambattista Bloisi 9bf2bda1c6 Fix: next returned a null value at end of stream 2024-06-12 13:28:51 +02:00
Giambattista Bloisi d90cb099b8 Fix for paginationStart parameter management 2024-06-11 20:23:44 +02:00
Giambattista Bloisi 4f2a61e10f Change the selection criteria for the pivot record of a group so that best pid type becomes the first criterion. This will have the effect of slowly converging to records having a DOI pid 2024-06-11 15:33:56 +02:00
Claudio Atzori 11fe3a4fe0 [graph resolution] use sparkExecutorMemory to define also the memoryOverhead 2024-06-11 14:21:17 +02:00
Claudio Atzori a8d68c9d29 avoid NPEs 2024-06-11 14:19:24 +02:00
Miriam Baglioni 8fe934810f Merge remote-tracking branch 'origin/beta' into beta 2024-06-11 10:28:51 +02:00
Miriam Baglioni 9da006e98c [SDGFoSActionSet] removed datainfo for the result. It is not needed (qualifier.classid = UPDATE), since subjects do not go at the level of the instance 2024-06-11 10:28:32 +02:00
Giambattista Bloisi 85c1eae7e0 Fixes for pagination strategy looping at end of download 2024-06-10 19:03:58 +02:00
Claudio Atzori b0eba210c0 [actionset promotion] use sparkExecutorMemory to define also the memoryOverhead 2024-06-10 16:15:24 +02:00
Claudio Atzori 3776327a8c hostedby patching to work with the updated Crossref contents, resolved conflict 2024-06-10 15:24:12 +02:00
Claudio Atzori 0139f23d66 Merge pull request 'organization type from OpenOrgs' (#445) from import_openorg_type into beta
Reviewed-on: D-Net/dnet-hadoop#445
2024-06-07 12:17:31 +02:00
Michele Artini c726572418 changed some parameters in OSF test 2024-06-07 12:03:26 +02:00
Claudio Atzori ec79405cc9 [graph raw] set organization type from openorgs 2024-06-07 11:30:31 +02:00
Miriam Baglioni 1477406ecc [bulkTag] fixed issue that made project disappear in graph_10_enriched 2024-06-06 10:45:41 +02:00
Claudio Atzori 92c3abd5a4 [graph cleaning] use sparkExecutorMemory to define also the memoryOverhead 2024-06-06 10:44:33 +02:00
Claudio Atzori ce2364743a applying changes from PR#442: Fix for missing collectedfrom after dedup 2024-06-06 10:43:43 +02:00
Claudio Atzori f70dc76b61 minor 2024-06-06 10:43:10 +02:00
Claudio Atzori 73bd1938a5 [graph2hive] use sparkExecutorMemory to define also the memoryOverhead 2024-06-05 12:17:35 +02:00
Claudio Atzori da5c1e73a4 Merge pull request 'Irish oaipmh exporter' (#443) from irish-oaipmh-exporter into beta
Reviewed-on: D-Net/dnet-hadoop#443
2024-06-05 10:55:09 +02:00
Claudio Atzori a02f3f0d2b code formatting 2024-05-30 10:21:18 +02:00
Alessia Bardi eadfd8d71d Merge pull request 'Updated XMLIterator for splitting on different nodes' (#436) from dblp_collection_plugin into beta
Reviewed-on: D-Net/dnet-hadoop#436
2024-05-29 16:05:06 +02:00
Alessia Bardi 05ee783c07 Merge branch 'beta' into dblp_collection_plugin 2024-05-29 16:04:39 +02:00
Alessia Bardi fe9fb59c90 Merge pull request 'Rest collector plugin on hadoop supports a new param to pass request headers' (#441) from rest-collector-request-header-map into beta
Reviewed-on: D-Net/dnet-hadoop#441
2024-05-29 15:54:39 +02:00
Claudio Atzori c272c4ad68 code formatting 2024-05-29 15:50:07 +02:00
Alessia Bardi c5f4da16a4 Merge branch 'beta' into rest-collector-request-header-map 2024-05-29 15:46:23 +02:00
Alessia 1b165a14a0 Rest collector plugin on hadoop supports a new param to pass request headers 2024-05-29 15:41:36 +02:00
Michele Artini e996787be2 OSF test 2024-05-29 15:05:17 +02:00
Claudio Atzori 62716141c5 Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster' (#440) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#440
2024-05-29 14:34:51 +02:00
Miriam Baglioni 5d85b70e1f [NOAMI] removed Ireland funder id 501100011103. ticket 9635 2024-05-29 11:55:00 +02:00
Giambattista Bloisi 73316d8c83 Add jaxb and jaxws dependencies when compiling with spark-34 profile as they are required to run with jdk > 8 2024-05-28 14:14:51 +02:00
Miriam Baglioni 75d5ddb999 Update to include a blackList that filters out the results we know are wrongly associated with IE - updated workflow definition - the blacklist parameter 2024-05-27 12:01:28 +02:00
Miriam Baglioni 87c9c61b41 Update to include a blackList that filters out the results we know are wrongly associated with IE - refactoring 2024-05-27 12:01:16 +02:00
Miriam Baglioni b55fed09f8 Update to include a blackList that filters out the results we know are wrongly associated with IE 2024-05-27 12:01:01 +02:00
Claudio Atzori 107d958b89 [org dedup] avoid NPEs in SparkPrepareNewOrgs 2024-05-27 11:59:54 +02:00
Claudio Atzori 3a7a6ecc32 [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-27 11:59:45 +02:00
Claudio Atzori 1af4224d3d [org dedup] avoid NPEs in SparkPrepareOrgRels 2024-05-27 11:59:33 +02:00
Claudio Atzori 0d5bdb2db0 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-05-27 11:59:02 +02:00
Claudio Atzori 66548e6a83 Merge pull request 'changes in copy script' (#438) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#438
2024-05-27 11:54:03 +02:00
Giambattista Bloisi 1b2357e10a Merge pull request 'Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12' (#327) from spark34-integration into beta
Reviewed-on: D-Net/dnet-hadoop#327
2024-05-23 09:20:28 +02:00
Sandro La Bruzzo f1fe363b19 merged again from beta (I hope for the last time) 2024-05-22 11:08:52 +02:00
Sandro La Bruzzo 66c1ffc866 merged again from beta (I hope for the last time) 2024-05-22 11:02:46 +02:00
Sandro La Bruzzo e8a61d5dd5 removed plugin, use only FileGZip plugin 2024-05-21 13:45:29 +02:00
Sandro La Bruzzo ca9414b737 Implement multiple node name splitter on GZipCollectorPlugin and all nodes that use XMLIterator. If the splitter name is a comma-separated list of values, it splits on all of them 2024-05-21 09:11:13 +02:00
Sandro La Bruzzo 103e2652b3 merged beta 2024-05-17 14:43:07 +02:00
Sandro La Bruzzo a87f9ea643 fixed scholexplorer bug 2024-05-17 14:16:43 +02:00
Sandro La Bruzzo 6efab4d88e fixed scholexplorer bug 2024-05-16 16:19:18 +02:00
Sandro La Bruzzo db358ad0d2 code formatted 2024-05-02 15:25:57 +02:00
Sandro La Bruzzo 26bf8e763a merged from beta 2024-05-02 15:20:23 +02:00
Sandro La Bruzzo a860c57bbc updated .gitignore 2024-05-02 15:16:00 +02:00
Sandro La Bruzzo 0646d0d064 Updated main sparkApplication to avoid requiring the master variable 2024-05-02 15:15:03 +02:00
Sandro La Bruzzo 133ead1e3e updated new version of scholexplorer Generation 2024-04-29 09:00:30 +02:00
Sandro La Bruzzo 052c6aac9d formatted code 2024-04-26 16:03:04 +02:00
Sandro La Bruzzo 9cd3bc0f10 Added a new generation of the dump for scholexplorer, tested with the latest version of Spark, and strongly refactored 2024-04-26 16:02:07 +02:00
Sandro La Bruzzo 0d628cd62b merged again from beta 2024-04-23 17:34:55 +02:00
Sandro La Bruzzo 073f320c6a Added module containing all the dependencies, useful for Spark deployment on k8s. 2024-04-22 11:32:31 +02:00
Sandro La Bruzzo b84ad0c06e merged beta 2024-04-19 14:39:59 +02:00
Sandro La Bruzzo 8dd9cf84e2 code formatted 2024-04-19 12:30:59 +02:00
Sandro La Bruzzo 342cb6189b fixed problem with the changed signature of RowEncoder
removed property dhp.schema.artifact
2024-04-19 12:13:26 +02:00
Giambattista Bloisi 613ec5ffce Add profiles for different spark versions: spark-24, spark-34, spark-35 2023-12-05 19:11:06 +01:00
Sandro La Bruzzo 52495f2cd2 used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader 2023-12-05 19:11:06 +01:00
Sandro La Bruzzo 8c3e9a09d3 added repository openaire-third-parties 2023-12-05 19:11:06 +01:00
Giambattista Bloisi 2fa78f6071 Changes required to build and run tests with Java 17 2023-12-05 19:11:06 +01:00
Giambattista Bloisi 326c9dc08c Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12 2023-12-05 19:11:06 +01:00
112 changed files with 3685 additions and 3561 deletions

.gitignore vendored
View File

@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath
/**/.scalafmt.conf
/.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder;
// execute
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
try {
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
}
@Test
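An equivalent, more compact form of the try/catch assertion above is sketched here with JUnit 5's Assertions.assertThrows, which returns the thrown exception; this is a sketch, assuming only the two exception types named in the test are acceptable:

Exception e = Assertions.assertThrows(Exception.class, () -> mojo.execute());
// accept either of the two exception types the test tolerates
Assertions.assertTrue(
    e instanceof MojoExecutionException || e instanceof IllegalArgumentException);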

View File

@ -70,10 +70,7 @@
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
@ -163,7 +160,7 @@
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>${dhp-schemas.artifact}</artifactId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
@ -172,4 +169,23 @@
</dependency>
</dependencies>
<!-- dependencies required on JDK9+ because J2EE has been removed -->
<profiles>
<profile>
<id>spark-34</id>
<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.2.11</version>
</dependency>
<dependency>
<groupId>com.sun.xml.ws</groupId>
<artifactId>jaxws-ri</artifactId>
<version>2.3.3</version>
<type>pom</type>
</dependency>
</dependencies>
</profile>
</profiles>
</project>

View File

@ -7,12 +7,12 @@ import java.sql.*;
import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable {
private static final Logger log = LoggerFactory.getLogger(DbClient.class);
private static final Log log = LogFactory.getLog(DbClient.class);
private final Connection connection;
@ -37,8 +37,6 @@ public class DbClient implements Closeable {
try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100);
log.info("running SQL:\n\n" + sql + "\n\n"); // commons-logging has no {} placeholders
try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
consumer.accept(rs);
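For orientation, a usage sketch of this class; the constructor signature and the enclosing method name (processResults) are assumptions inferred from the visible fragment, and the connection details are placeholders:

import java.sql.SQLException;

public class DbClientExample {
    public static void main(String[] args) throws Exception {
        // hypothetical constructor and method names, placeholder connection details
        try (DbClient client = new DbClient("jdbc:postgresql://localhost/dnet", "user", "pass")) {
            // rows are streamed through the consumer; setFetchSize(100) above
            // keeps memory bounded for large result sets
            client.processResults("SELECT id FROM organizations", rs -> {
                try {
                    System.out.println(rs.getString("id")); // one callback per row
                } catch (SQLException e) {
                    throw new RuntimeException(e);
                }
            });
        }
    }
}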

View File

@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
} catch (Exception e) {
throw new RuntimeException(e);
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private final InputStream inputStream;
private final MediaType mediaType;
private final long length;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
}
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.length = len;
}
@Override
public MediaType contentType() {
return mediaType;
}
@Override
public long contentLength() {
return length;
}
@Override
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
sink.writeAll(source);
} finally {
Util.closeQuietly(source);
}
}
}
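A usage sketch for the streaming request body above; the URL and file name are placeholders. The point of the class is to PUT large payloads without buffering them fully in memory:

import java.io.File;
import java.io.FileInputStream;

import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

public class StreamingUploadExample {
    public static void main(String[] args) throws Exception {
        File f = new File("dump.zip"); // placeholder file
        RequestBody body = InputStreamRequestBody
            .create(MediaType.parse("application/zip"), new FileInputStream(f), f.length());
        Request request = new Request.Builder()
            .url("https://example.org/upload") // placeholder URL
            .put(body)
            .build();
        try (Response response = new OkHttpClient().newCall(request).execute()) {
            System.out.println(response.code());
        }
    }
}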

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.common.api;
public class MissingConceptDoiException extends Throwable {
public MissingConceptDoiException(String message) {
super(message);
}
}

View File

@ -0,0 +1,363 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public void setDeposition_id(String deposition_id) {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
}
/**
* Creates a brand new deposition in Zenodo. It sets the deposition_id and the bucket where the files to upload will be stored
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
* Uploads a file to the current deposition in Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @return the response code
*/
public int uploadIS(InputStream is, String file_name) throws IOException {
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
* Associates metadata information with the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMetadata(String metadata) throws IOException {
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
// true only for the expected success codes: 200 (OK) and 201 (CREATED)
return HttpURLConnection.HTTP_OK == responseCode
|| HttpURLConnection.HTTP_CREATED == responseCode;
}
/**
* Publishes the current deposition. It works both for a new deposition and for a new version of an old deposition
*
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* Creates a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
* Resumes a version or new deposition that was uploaded but not yet published.
* It sets the deposition_id and the bucket to be used.
*
* @param deposition_id the deposition id of the not yet published upload
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
deposition_id = zm.getId();
return;
}
}
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions(String page) throws IOException {
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String inputUrl) throws IOException {
URL url = new URL(inputUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
}
}
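A minimal end-to-end usage sketch of the client above; the URL and token are placeholders and the metadata JSON is an assumption about the expected shape (the flow matches the test class further below):

import java.io.FileInputStream;
import java.io.InputStream;

public class ZenodoUploadExample {
    public static void main(String[] args) throws Exception {
        ZenodoAPIClient client = new ZenodoAPIClient(
            "https://sandbox.zenodo.org/api/deposit/depositions", // placeholder URL
            "<ACCESS_TOKEN>"); // placeholder token
        client.newDeposition(); // sets deposition_id and the upload bucket
        try (InputStream is = new FileInputStream("dump.json.gz")) {
            client.uploadIS(is, "dump.json.gz"); // streams the file into the bucket
        }
        client.sendMetadata("{\"metadata\":{\"title\":\"My dump\"}}"); // assumed JSON shape
        client.publish(); // deprecated above, but still what the tests exercise
    }
}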

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Community {
private String identifier;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Creator {
private String affiliation;
private String name;
private String orcid;
public String getAffiliation() {
return affiliation;
}
public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (name != null) {
c.name = name;
}
if (affiliation != null) {
c.affiliation = affiliation;
}
if (orcid != null) {
c.orcid = orcid;
}
return c;
}
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
public String getChecksum() {
return checksum;
}
public void setChecksum(String checksum) {
this.checksum = checksum;
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public long getFilesize() {
return filesize;
}
public void setFilesize(long filesize) {
this.filesize = filesize;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Grant implements Serializable {
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Grant newInstance(String id) {
Grant g = new Grant();
g.id = id;
return g;
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Links implements Serializable {
private String bucket;
private String discard;
private String edit;
private String files;
private String html;
private String latest_draft;
private String latest_draft_html;
private String publish;
private String self;
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public String getDiscard() {
return discard;
}
public void setDiscard(String discard) {
this.discard = discard;
}
public String getEdit() {
return edit;
}
public void setEdit(String edit) {
this.edit = edit;
}
public String getFiles() {
return files;
}
public void setFiles(String files) {
this.files = files;
}
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getLatest_draft() {
return latest_draft;
}
public void setLatest_draft(String latest_draft) {
this.latest_draft = latest_draft;
}
public String getLatest_draft_html() {
return latest_draft_html;
}
public void setLatest_draft_html(String latest_draft_html) {
this.latest_draft_html = latest_draft_html;
}
public String getPublish() {
return publish;
}
public void setPublish(String publish) {
this.publish = publish;
}
public String getSelf() {
return self;
}
public void setSelf(String self) {
this.self = self;
}
}

View File

@ -0,0 +1,153 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
}
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public List<Community> getCommunities() {
return communities;
}
public void setCommunities(List<Community> communities) {
this.communities = communities;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<Grant> getGrants() {
return grants;
}
public void setGrants(List<Grant> grants) {
this.grants = grants;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getLicense() {
return license;
}
public void setLicense(String license) {
this.license = license;
}
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
}
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
}
public List<String> getReferences() {
return references;
}
public void setReferences(List<String> references) {
this.references = references;
}
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
}
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class PrereserveDoi implements Serializable {
private String doi;
private String recid;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getRecid() {
return recid;
}
public void setRecid(String recid) {
this.recid = recid;
}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class RelatedIdentifier implements Serializable {
private String identifier;
private String relation;
private String resource_type;
private String scheme;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelation() {
return relation;
}
public void setRelation(String relation) {
this.relation = relation;
}
public String getResource_type() {
return resource_type;
}
public void setResource_type(String resource_type) {
this.resource_type = resource_type;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
}
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public List<File> getFiles() {
return files;
}
public void setFiles(List<File> files) {
this.files = files;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Links getLinks() {
return links;
}
public void setLinks(Links links) {
this.links = links;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public String getModified() {
return modified;
}
public void setModified(String modified) {
this.modified = modified;
}
public String getOwner() {
return owner;
}
public void setOwner(String owner) {
this.owner = owner;
}
public String getRecord_id() {
return record_id;
}
public void setRecord_id(String record_id) {
this.record_id = record_id;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public boolean isSubmitted() {
return submitted;
}
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

View File

@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.getContext()
.stream()
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
.collect(Collectors.toCollection(ArrayList::new)));
.collect(Collectors.toList()));
}
return (T) res;
} else {
@ -1003,41 +1003,4 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.orElse(null);
}
/**
* Implements bad and ugly things that we should get rid of ASAP.
*
* @param value
* @return
* @param <T>
*/
public static <T extends Oaf> T dedicatedUglyHacks(T value) {
if (value instanceof OafEntity) {
if (value instanceof Result) {
final Result r = (Result) value;
// Fix for AMS Acta
Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(
i -> Optional
.ofNullable(i.getHostedby())
.map(KeyValue::getKey)
.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
.orElse(false)))
.ifPresent(instance -> instance.forEach(i -> {
if (Optional
.ofNullable(i.getPid())
.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
.orElse(false)) {
i.setHostedby(UNKNOWN_REPOSITORY);
}
}));
}
}
return value;
}
}

View File

@ -328,7 +328,7 @@ public class MergeUtils {
final T merged = mergeOafFields(original, enrich, trust);
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
merged.setDateofcollection(LocalDateTime.now().toString());
merged
.setDateoftransformation(
@ -432,10 +432,7 @@ public class MergeUtils {
// merge datainfo for same context id
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
ArrayList<DataInfo> di = new ArrayList<>();
di.addAll(r.getDataInfo());
di.addAll(l.getDataInfo());
r.setDataInfo(di);
r.getDataInfo().addAll(l.getDataInfo());
return r;
}));
@ -658,13 +655,6 @@ public class MergeUtils {
return d1;
}
if (StringUtils.contains(d1.getValue(), "null")) {
return d2;
}
if (StringUtils.contains(d2.getValue(), "null")) {
return d1;
}
return Stream
.of(d1, d2)
.min(

View File

@ -154,5 +154,13 @@
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
}
}
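The entries above feed an inverse-relation lookup (see the Scala invRel change further below, which lower-cases the key before the lookup); a sketch with the map contents abridged to the two new entries:

import java.util.Map;

public class InverseRelationExample {
    // abridged copy of the vocabulary entries added above
    private static final Map<String, String> INVERSE = Map.of(
        "isamongtopnsimilardocuments", "HasAmongTopNSimilarDocuments",
        "hasamongtopnsimilardocuments", "IsAmongTopNSimilarDocuments");

    public static void main(String[] args) {
        // keys are looked up lower-cased, mirroring invRel
        System.out.println(INVERSE.get("IsAmongTopNSimilarDocuments".toLowerCase()));
        // -> HasAmongTopNSimilarDocuments
    }
}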

View File

@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession
val b = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
if (master != null)
b.master(master)
b.getOrCreate()
}
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {

View File

@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
val sum = ScholixUtils.resultToSummary(r)
if (sum != null)
generateScholixResourceFromSummary(sum)
else
null
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
}
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
// s.setTypology(r.getResulttype.getClassid)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.common.api;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newDeposition());
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMetadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
}

View File

@ -24,7 +24,7 @@
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
@ -59,14 +59,6 @@
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
@ -91,10 +83,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
@ -113,4 +101,90 @@
</dependency>
</dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.MapDocumentUtil
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(RowEncoder(schema))
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
}
def rowFromJson(json: String): Row = {

View File

@ -1,48 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("countryMatch")
public class CountryMatch extends AbstractStringComparator {
public CountryMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
}
public CountryMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1.0; // return -1 if a field is missing
}
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
return -1.0; // return -1 if a country is UNKNOWN
}
return a.equals(b) ? 1.0 : 0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
RowEncoder(schema)
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
ExpressionEncoder(schema)
}
}
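Both SparkCompatUtils variants expose the same signature, so call sites compile unchanged whichever profile's source directory is active; a sketch of such a call site (the schema contents are an assumption, and the Java call relies on the static forwarder Scala generates for objects):

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import eu.dnetlib.pace.util.SparkCompatUtils;

public class EncoderExample {
    public static void main(String[] args) {
        StructType schema = new StructType().add("identifier", DataTypes.StringType);
        // resolves to RowEncoder(schema) on Spark 2/3.4 and ExpressionEncoder(schema) on 3.5
        ExpressionEncoder<Row> encoder = SparkCompatUtils.encoderFor(schema);
        System.out.println(encoder.schema());
    }
}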

View File

@ -336,23 +336,4 @@ public class ComparatorTest extends AbstractPaceTest {
System.out.println("compare = " + compare);
}
@Test
public void countryMatch() {
CountryMatch countryMatch = new CountryMatch(params);
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
assertEquals(-1.0, result);
result = countryMatch.distance("CHILE", "UNKNOWN", conf);
assertEquals(-1.0, result);
result = countryMatch.distance("CHILE", "ITALY", conf);
assertEquals(0.0, result);
result = countryMatch.distance("CHILE", "CHILE", conf);
assertEquals(1.0, result);
}
}

View File

@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest {

dhp-shade-package/pom.xml Normal file
View File

@ -0,0 +1,169 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module creates a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD
.union(projectsRDD)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}

View File

@ -105,8 +105,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret
.iterator();
@ -140,17 +140,11 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution", functions
.explode(
functions.col("institutions")))
.selectExpr(
"id", "doi", "institution.ror as ror",
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
"institution.country_code as country_code", "publication_year")
.distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
}
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Map;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
final String requestHeaderMap = api.getParams().get("requestHeaderMap");
Gson gson = new Gson();
Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank)
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
entityXpath,
authMethod,
authToken,
resultOutputFormat);
resultOutputFormat,
requestHeaders);
return StreamSupport
.stream(

View File

@ -9,8 +9,11 @@ import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
@ -18,22 +21,18 @@ import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import javax.xml.xpath.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
@ -48,20 +47,23 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
*
*/
public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5;
private final HttpClientParams clientParams;
private final String BASIC = "basic";
private final String AUTHBASIC = "basic";
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
+ ">";
private final String baseUrl;
private final String resumptionType;
private final String resumptionParam;
private final String resultFormatValue;
private String queryParams;
private String queryParams = "";
private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
@ -89,6 +91,11 @@ public class RestIterator implements Iterator<String> {
*/
private final String resultOutputFormat;
/*
* Can be used to set additional request headers, like for content negotiation
*/
private Map<String, String> requestHeaders;
/**
* RestIterator class compatible to version 1.3.33
*/
@ -107,7 +114,8 @@ public class RestIterator implements Iterator<String> {
final String entityXpath,
final String authMethod,
final String authToken,
final String resultOutputFormat) {
final String resultOutputFormat,
final Map<String, String> requestHeaders) {
this.clientParams = clientParams;
this.baseUrl = baseUrl;
@ -119,6 +127,7 @@ public class RestIterator implements Iterator<String> {
this.authMethod = authMethod;
this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat;
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
@ -148,7 +157,12 @@ public class RestIterator implements Iterator<String> {
}
private void initQueue() {
this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
query = baseUrl;
} else {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
}
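// when queryParams, querySize and queryFormat are all empty, the base URL is
// requested as-is instead of appending a dangling '?'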
log.info("REST calls starting with {}", this.query);
}
@ -162,19 +176,6 @@ public class RestIterator implements Iterator<String> {
*/
@Override
public boolean hasNext() {
if (this.recordQueue.isEmpty() && this.query.isEmpty()) {
disconnect();
return false;
}
return true;
}
/*
* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (this.recordQueue) {
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
try {
@ -184,6 +185,23 @@ public class RestIterator implements Iterator<String> {
throw new RuntimeException(e);
}
}
if (!this.recordQueue.isEmpty()) {
return true;
}
disconnect();
return false;
}
}
/*
* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (this.recordQueue) {
return this.recordQueue.poll();
}
}
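
The refactor above moves the page-fetching loop out of next() and into hasNext(), so the iterator only reports exhaustion once the queue is drained and no follow-up query remains. A self-contained sketch of that contract, with downloadPage() stubbed and all names assumed:

import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;

class QueueBackedIteratorSketch implements Iterator<String> {

	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
	private String query = "https://example.org/api?page=1"; // assumed endpoint

	// stub: fetch the page behind q, enqueue its records, and return the
	// follow-up query, or "" once the source is exhausted
	private String downloadPage(String q) {
		return "";
	}

	@Override
	public boolean hasNext() {
		synchronized (this.recordQueue) {
			while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
				this.query = downloadPage(this.query); // refill the queue
			}
			if (!this.recordQueue.isEmpty()) {
				return true;
			}
			return false; // drained; the real class also disconnects here
		}
	}

	@Override
	public String next() {
		synchronized (this.recordQueue) {
			return this.recordQueue.poll(); // hasNext() has already refilled the queue
		}
	}
}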
@ -209,9 +227,8 @@ public class RestIterator implements Iterator<String> {
try {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String resultXml = XML_HEADER;
String nextQuery = "";
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
@ -226,37 +243,48 @@ public class RestIterator implements Iterator<String> {
}
}
// find the pagination start page number in queryParams and remove it before issuing the first query
if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
&& (query.contains("paginationStart="))) {
final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
m.find(); // guaranteed to be true for this regex
String[] pageVal = m.group(0).split("=");
pagination = Integer.parseInt(pageVal[1]);
// remove page start number from query and queryParams
queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
query = query.replaceFirst("&?paginationStart=[0-9]+", "");
}
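// example: query ".../api?size=10&paginationStart=100" yields pagination = 100,
// and "&paginationStart=100" is stripped from query and queryParams before the
// first request is issued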
try {
log.info("requesting URL [{}]", query);
final URL qUrl = new URL(query);
log.debug("authMethod: {}", this.authMethod);
if ("bearer".equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: {}", resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: {}", resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
if ("bearer".equalsIgnoreCase(this.authMethod)) { // content comparison; == would only compare references
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
requestHeaders.put("Authorization", "Bearer " + authToken);
// requestHeaders.put("Content-Type", "application/json");
} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
requestHeaders.put("Authorization", "Basic " + authToken);
// requestHeaders.put("accept", "application/xml");
}
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestMethod("GET");
this.setRequestHeader(conn);
resultStream = conn.getInputStream();
this.resultStream = theHttpInputStream;
if ("json".equals(this.resultOutputFormat)) {
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson);
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
if (!isEmptyXml(resultXml)) {
resultNode = (Node) this.xpath
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
@ -265,8 +293,7 @@ public class RestIterator implements Iterator<String> {
final StringWriter sw = new StringWriter();
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
final String toEnqueue = sw.toString();
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
log
.warn(
"The following record resulted in empty item for the feeding queue: {}", resultXml);
@ -294,6 +321,7 @@ public class RestIterator implements Iterator<String> {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
final String[] arrayQUrlArgument = qUrlArgument.split("&");
for (final String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
@ -307,7 +335,7 @@ public class RestIterator implements Iterator<String> {
}
}
if (((emptyXml).equalsIgnoreCase(resultXml))
if (isEmptyXml(resultXml)
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
// resumptionStr = "";
if (nodeList != null) {
@ -326,13 +354,13 @@ public class RestIterator implements Iterator<String> {
case "pagination":
case "page": // pagination, iterate over page numbers
this.pagination += 1;
if (nodeList != null) {
if (nodeList != null && nodeList.getLength() > 0) {
this.discoverResultSize += nodeList.getLength();
} else {
this.resultTotal = this.discoverResultSize;
this.pagination = this.discoverResultSize;
}
this.pagination += 1;
this.resumptionInt = this.pagination;
this.resumptionStr = Integer.toString(this.resumptionInt);
break;
@ -380,7 +408,8 @@ public class RestIterator implements Iterator<String> {
try {
if (this.resultTotal == -1) {
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
if ("page".equalsIgnoreCase(this.resumptionType)
&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + this.resultTotal);
@ -409,6 +438,10 @@ public class RestIterator implements Iterator<String> {
}
private boolean isEmptyXml(String s) {
return EMPTY_XML.equalsIgnoreCase(s);
}
private boolean isInteger(final String s) {
boolean isValidInteger = false;
try {
@ -433,6 +466,22 @@ public class RestIterator implements Iterator<String> {
}
}
/**
* Applies the configured request headers to the given connection.
* Note: setRequestProperty overwrites the value of any property whose key already exists.
* @param conn the connection to decorate with the request headers
*/
private void setRequestHeader(HttpURLConnection conn) {
if (requestHeaders != null) {
for (String key : requestHeaders.keySet()) {
conn.setRequestProperty(key, requestHeaders.get(key));
}
log.debug("Set Request Header with: " + requestHeaders);
}
}
public String getResultFormatValue() {
return this.resultFormatValue;
}
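
Putting the header handling above together, a runnable sketch (endpoint, token and header values assumed) of how the shared header map ends up on the connection:

import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

public class RequestHeaderSketch {

	public static void main(String[] args) throws Exception {
		Map<String, String> requestHeaders = new HashMap<>();
		requestHeaders.put("User-Agent", "OpenAIRE DEV"); // from the requestHeaderMap parameter
		requestHeaders.put("Authorization", "Bearer <token>"); // merged in when authMethod is "bearer"

		HttpURLConnection conn = (HttpURLConnection) new URL("https://example.org/api").openConnection();
		conn.setRequestMethod("GET");
		// setRequestProperty overwrites any existing value for the same key
		for (Map.Entry<String, String> e : requestHeaders.entrySet()) {
			conn.setRequestProperty(e.getKey(), e.getValue());
		}
		System.out.println(conn.getResponseCode());
	}
}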

View File

@ -8,7 +8,10 @@ import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {
private String element;
private List<String> elements;
private InputStream inputStream;
public XMLIterator(final String element, final InputStream inputStream) {
super();
this.element = element;
if (element.contains(",")) {
elements = Arrays
.stream(element.split(","))
.filter(StringUtils::isNoneBlank)
.map(String::toLowerCase)
.collect(Collectors.toList());
}
this.inputStream = inputStream;
this.parser = getParser();
try {
this.current = findElement(parser);
} catch (XMLStreamException e) {
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
final XMLEvent event = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking.
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
writer.add(event);
break;
}
@ -142,31 +156,48 @@ public class XMLIterator implements Iterator<String> {
XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
if (isCheckTag(name))
return peek;
}
}
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart();
if (element.equals(name)) {
if (isCheckTag(name))
return event;
}
}
}
return null;
}
private XMLEventReader getParser() {
try {
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
XMLInputFactory xif = inputFactory.get();
xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
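// disabling DTD support means external DTDs are never fetched or processed (XXE hardening)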
return xif.createXMLEventReader(sanitize(inputStream));
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
}
private boolean isCheckTag(final String tagName) {
if (elements != null) {
final String found = elements
.stream()
.filter(e -> e.equalsIgnoreCase(tagName))
.findFirst()
.orElse(null);
if (found != null)
return true;
} else {
if (element.equalsIgnoreCase(tagName)) {
return true;
}
}
return false;
}
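// example: splitOnElement = "incollection,article" yields
// elements = ["incollection", "article"] (lower-cased, blanks dropped), so
// isCheckTag("Article") -> true (case-insensitive) and isCheckTag("book") -> false;
// a value without a comma falls back to matching the single element name (now case-insensitively)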
private Reader sanitize(final InputStream in) {
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);

View File

@ -1,134 +0,0 @@
package eu.dnetlib.dhp.transformation.xslt;
import java.io.Serializable;
import net.sf.saxon.s9api.*;
import org.apache.commons.io.IOUtils;
import org.json.JSONObject;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
* This class fetches JSON from a provided link and returns
* a Dublin Core. This functionality is particularly needed for OSF Preprints
*/
public class DataFetcher implements ExtensionFunction, Serializable {
/**
* This method fetches JSON object from a given URL
* @param url a url in the metadata for fetching authors in JSON format
* @return
* @throws IOException
*/
static JSONObject getJson(URL url) throws IOException {
String json = IOUtils.toString(url);
return new JSONObject(json);
}
/**
* This method extracts authors from a given JSON
*
* @param jsonObject
* @return
*/
static List<String> getAuthorsFromJson(JSONObject jsonObject) {
List<String> authors = new ArrayList<>();
// count of authors
int countOfAuthors = jsonObject.getJSONArray("data").length();
for (int i = 0; i < countOfAuthors; i++) {
authors.add(jsonObject
.getJSONArray("data")
.getJSONObject(i)
.getJSONObject("embeds")
.getJSONObject("users")
.getJSONObject("data")
.getJSONObject("attributes")
.getString("full_name"));
}
return authors;
}
/**
* This method transforms list of authors into Dublin Core
* @param authors
* @return Dublin Core list of authors
*/
static List<String> transformListToDublinCore(List<String> authors) {
List<String> dublinCoreAuthors = new ArrayList<>();
for (String author : authors){
//splitting full name into first and last names according to OpenAIRE v3 guidelines at:
// https://guidelines.openaire.eu/en/latest/literature/field_creator.html
// surname, initials (first name) prefix.
String[] parts = author.split(" ");
String firstName = parts[0];
String lastName = parts[1];
char initialOfFirstName = firstName.charAt(0);
dublinCoreAuthors.add(
"<dc:creator>" + lastName + ", " + initialOfFirstName + ". (" + firstName + ")" + "</dc:creator>");
}
return dublinCoreAuthors;
}
/**
* This is a public method which fetches authors and transform them into Dublin Core
*/
public static String getAndTransformAuthors(URL url) throws IOException{
return String.join(", ", transformListToDublinCore(getAuthorsFromJson(getJson(url))));
}
/**
* This method extracts link to fulltext from a given JSON
*
* @return
*/
static private String getLinkToFulltextFromJson(JSONObject jsonObject) throws MalformedURLException {
// note: Link to JSON containing fulltextlink is in "primary_file" attribute.
// And in the resultant JSON, links->download contains the URL to fulltext
return jsonObject
.getJSONObject("data")
.getJSONObject("links")
.getString("download");
}
/**
* This is a public method which fetches link to full text and returns it as a suitable format
*/
public static String getFullTextLinkAndTransform (URL url )throws IOException{
return getLinkToFulltextFromJson(getJson(url));
}
@Override
public QName getName() {
return null;
}
@Override
public SequenceType getResultType() {
return null;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[0];
}
@Override
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
return null;
}
}

View File

@ -55,8 +55,6 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
processor.registerExtensionFunction(new DateCleaner());
processor.registerExtensionFunction(new PersonCleaner());
processor.registerExtensionFunction(new DataFetcher());
final XsltCompiler comp = processor.newXsltCompiler();
QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));

View File

@ -35,6 +35,5 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
openapcInputPath=/data/bip-affiliations/openapc-data.json
dataciteInputPath=/data/bip-affiliations/datacite-data.json
webCrawlInputPath=/data/bip-affiliations/webCrawl/
outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -1,5 +1,10 @@
[
{
"id": "100007630",
"uri": "http://dx.doi.org/10.13039/100007630",
"name": "College of Engineering and Informatics, National University of Ireland, Galway",
"synonym": []
},
{
"id": "100007731",
"uri": "http://dx.doi.org/10.13039/100007731",
@ -427,13 +432,13 @@
"id": "501100001634",
"uri": "http://dx.doi.org/10.13039/501100001634",
"name": "University of Galway",
"synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
"synonym": []
},
{
"id": "501100001635",
"uri": "http://dx.doi.org/10.13039/501100001635",
"name": "University of Limerick",
"synonym": ["501100014531"]
"synonym": []
},
{
"id": "501100001636",
@ -463,7 +468,7 @@
"id": "501100002736",
"uri": "http://dx.doi.org/10.13039/501100002736",
"name": "Covidien",
"synonym": ["501100003956"]
"synonym": []
},
{
"id": "501100002755",
@ -513,6 +518,12 @@
"name": "Irish Institute of Clinical Neuroscience",
"synonym": []
},
{
"id": "501100003956",
"uri": "http://dx.doi.org/10.13039/501100003956",
"name": "Aspect Medical Systems",
"synonym": []
},
{
"id": "501100004162",
"uri": "http://dx.doi.org/10.13039/501100004162",
@ -633,7 +644,12 @@
"name": "Irish Centre for High-End Computing",
"synonym": []
},
{
"id": "501100019905",
"uri": "http://dx.doi.org/10.13039/501100019905",
"name": "Galway University Foundation",
"synonym": []
},
{
"id": "501100020036",
"uri": "http://dx.doi.org/10.13039/501100020036",
@ -808,7 +824,12 @@
"name": "Energy Policy Research Centre, Economic and Social Research Institute",
"synonym": []
},
{
"id": "501100014531",
"uri": "http://dx.doi.org/10.13039/501100014531",
"name": "Physical Education and Sport Sciences Department, University of Limerick",
"synonym": []
},
{
"id": "501100014745",
"uri": "http://dx.doi.org/10.13039/501100014745",
@ -821,11 +842,22 @@
"name": "ADAPT - Centre for Digital Content Technology",
"synonym": []
},
{
"id": "501100020570",
"uri": "http://dx.doi.org/10.13039/501100020570",
"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
"synonym": []
},
{
"id": "501100020871",
"uri": "http://dx.doi.org/10.13039/501100020871",
"name": "Bernal Institute, University of Limerick",
"synonym": []
},
{
"id": "501100023852",
"uri": "http://dx.doi.org/10.13039/501100023852",
"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
"synonym": []
}
]

View File

@ -48,37 +48,12 @@
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
</property>
<property>
<name>JAVA_HOME</name>
<value>/srv/java/openjdk-17</value>
<description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
</property>
<property>
<name>JAVA_OPTS</name>
<value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
<description>Used to configure the JAVA_OPTS parameter</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.env</name>
<value>JAVA_HOME=${JAVA_HOME}</value>
</property>
</configuration>
</global>
<start to="collection_mode"/>
@ -124,7 +99,7 @@
<action name="CollectionWorker">
<java>
<main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
<java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
<java-opts>${collection_java_xmx}</java-opts>
<arg>--apidescriptor</arg><arg>${apiDescription}</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--workflowId</arg><arg>${workflowId}</arg>

View File

@ -8,40 +8,19 @@
<name>database</name>
<description>the PDB Database Working Path</description>
</property>
<property>
<name>mdStoreOutputId</name>
<description>the identifier of the cleaned MDStore</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>the path of the cleaned mdstore</description>
<name>targetPath</name>
<description>the Target Working dir path</description>
</property>
</parameters>
<start to="StartTransaction"/>
<start to="ConvertDB"/>
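<!-- the MDStore StartTransaction/CommitVersion/RollBack actions are gone: ConvertDB now writes directly to targetPath -->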
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="StartTransaction">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="ConvertDB"/>
<error to="RollBack"/>
</action>
<action name="ConvertDB">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -62,48 +41,11 @@
<arg>--master</arg><arg>yarn</arg>
<arg>--dbPath</arg><arg>${sourcePath}</arg>
<arg>--database</arg><arg>${database}</arg>
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark>
<ok to="CommitVersion"/>
<error to="RollBack"/>
</action>
<action name="CommitVersion">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="RollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>ROLLBACK</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -2,5 +2,5 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
]

View File

@ -1,20 +1,5 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source Path",
"paramRequired": true
},
{
"paramName": "mo",
"paramLongName": "mdstoreOutputVersion",
"paramDescription": "the oaf path ",
"paramRequired": true
}
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
]

View File

@ -9,26 +9,34 @@
<description>the Working Path</description>
</property>
<property>
<name>mdStoreOutputId</name>
<description>the identifier of the cleaned MDStore</description>
<name>targetPath</name>
<description>the OAF MDStore Path</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>the path of the cleaned mdstore</description>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>resumeFrom</name>
<value>CreateEBIDataSet</value>
<value>DownloadEBILinks</value>
<description>node to start</description>
</property>
</parameters>
<start to="StartTransaction"/>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
<default to="DownloadEBILinks"/>
</switch>
</decision>
@ -69,29 +77,9 @@
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
</fs>
<ok to="StartTransaction"/>
<ok to="CreateEBIDataSet"/>
<error to="Kill"/>
</action>
<action name="StartTransaction">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="CreateEBIDataSet"/>
<error to="RollBack"/>
</action>
<action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
@ -107,49 +95,11 @@
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="CommitVersion">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="RollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>ROLLBACK</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2)
case _ =>
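// default case: ISSN types other than "electronic"/"print" are now ignored instead of raising a scala.MatchError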
}
})
}

View File

@ -231,7 +231,7 @@ object BioDBToOAF {
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pid = (json \ "pid").extract[String].trim()
val pid = (json \ "pid").extract[String]
val d = new Dataset

View File

@ -2,15 +2,12 @@ package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
object SparkTransformBioDatabaseToOAF {
@ -28,13 +25,8 @@ object SparkTransformBioDatabaseToOAF {
val dbPath: String = parser.get("dbPath")
log.info("dbPath: {}", dbPath)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath: String = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession =
SparkSession
@ -51,28 +43,24 @@ object SparkTransformBioDatabaseToOAF {
case "UNIPROT" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "PDB" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "SCHOLIX" =>
CollectionUtils.saveDataset(
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
case "CROSSREF_LINKS" =>
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
}
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}

View File

@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream
import scala.io.Source
import scala.xml.pull.XMLEventReader
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.charset.Charset
import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame {
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
} else
return IOUtils.toString(response.getEntity.getContent)
return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
)
),
Charset.defaultCharset()
)
)
parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", hdfsServerUri)
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
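// StAX (javax.xml.stream) replaces the deprecated scala.xml.pull.XMLEventReader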
new PMParser(xml)
})
)
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf]
.filter(p => p != null),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}

View File

@ -9,9 +9,6 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
object SparkEBILinksToOaf {
@ -35,13 +32,8 @@ object SparkEBILinksToOaf {
import spark.implicits._
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebLinks: Dataset[EBILinkItem] = spark.read
@ -54,10 +46,7 @@ object SparkEBILinksToOaf {
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
s"$outputBasePath/$MDSTORE_DATA_PATH"
targetPath
)
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml
*/

View File

@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -119,7 +119,9 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5"
"-inputFile", "input1;input2;input3;input4;input5",
"-format",
"COCI"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.IOException;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
public class FileGZipMultipleNodeTest {
private static final Logger log = LoggerFactory.getLogger(FileGZipMultipleNodeTest.class);
private final ApiDescriptor api = new ApiDescriptor();
private FileGZipCollectorPlugin plugin;
private static final String SPLIT_ON_ELEMENT = "incollection,article";
@BeforeEach
public void setUp() throws IOException {
final String gzipFile = Objects
.requireNonNull(
this
.getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
.getFile();
api.setBaseUrl(gzipFile);
HashMap<String, String> params = new HashMap<>();
params.put("splitOnElement", SPLIT_ON_ELEMENT);
api.setParams(params);
FileSystem fs = FileSystem.get(new Configuration());
plugin = new FileGZipCollectorPlugin(fs);
}
@Test
void test() throws CollectorException {
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
stream.limit(10).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
log.info(s);
});
}
}

View File

@ -36,11 +36,11 @@ public class OsfPreprintCollectorTest {
private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
private final String resumptionParam = "page";
private final String resumptionType = "page";
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
private final String resumptionType = "scan";
private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
private final String resultSizeParam = "";
private final String resultSizeValue = "";
private final String resultSizeParam = "page[size]";
private final String resultSizeValue = "100";
private final String resultFormatParam = "format";
private final String resultFormatValue = "json";
@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest {
final AtomicInteger i = new AtomicInteger(0);
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
stream.limit(200).forEach(s -> {
stream.limit(2000).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
i.incrementAndGet();
log.info(s);

View File

@ -4,6 +4,11 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@ -25,18 +32,18 @@ class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private final String resumptionType = "count";
private final String resumptionParam = "from";
private final String entityXpath = "//hits/hits";
private final String resumptionXpath = "//hits";
private final String resultTotalXpath = "//hits/total";
private final String resultFormatParam = "format";
private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
private final String resumptionType = "discover";
private final String resumptionParam = "skip";
private final String entityXpath = "//*[local-name()='data']";
private final String resumptionXpath = "";
private final String resultTotalXpath = "//*[local-name()='count']";
private final String resultFormatParam = "";
private final String resultFormatValue = "json";
private final String resultSizeParam = "size";
private final String resultSizeParam = "top";
private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
private final String query = "";
// private String query = "=(sources:engrXiv AND type:preprint)";
private final String protocolDescriptor = "rest_json2xml";
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query);
params.put("entityXpath", entityXpath);
params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
api.setBaseUrl(baseUrl);
api.setParams(params);
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0);
}
@Disabled
@Test
void testUrl() throws IOException {
String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
URL url = new URL(url_s);
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setRequestProperty("User-Agent", "OpenAIRE");
Gson gson = new Gson();
System.out.println("Request header");
System.out.println(gson.toJson(conn.getHeaderFields()));
InputStream inputStream = conn.getInputStream();
}
}

View File

@ -44,7 +44,7 @@ public class RestIteratorTest {
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
query, entityXpath, authMethod, authToken, resultOffsetParam);
query, entityXpath, authMethod, authToken, resultOffsetParam, null);
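// the trailing null is the new requestHeaders argument: no additional headers are set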
int i = 20;
while (iterator.hasNext() && i > 0) {
String result = iterator.next();

View File

@ -1,68 +0,0 @@
package eu.dnetlib.dhp.transformation.xslt;
import org.json.JSONObject;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class DataFetcherTest {
@BeforeEach
void setUp() {
}
@AfterEach
void tearDown() {
}
@Test
void getJson() throws IOException, URISyntaxException {
URL contributorsUrl = new URI("https://api.osf.io/v2/preprints/mrwqb/contributors/?format=json").toURL();
JSONObject testJsonObj = DataFetcher.getJson(contributorsUrl);
String x = testJsonObj
.getJSONArray("data")
.getJSONObject(0)
.getJSONObject("embeds")
.getJSONObject("users")
.getJSONObject("data")
.getJSONObject("attributes")
.getString("full_name");
System.out.println(x);
System.out.println(testJsonObj.getJSONArray("data").length());
testJsonObj.getJSONArray("data").forEach(System.out::println);
}
@Test
void getAuthorsFromJson() throws IOException, URISyntaxException {
URL contributorsUrl = new URI("https://api.osf.io/v2/preprints/mrwqb/contributors/?format=json").toURL();
JSONObject testJsonObj = DataFetcher.getJson(contributorsUrl);
List<String> authors = DataFetcher.getAuthorsFromJson(testJsonObj);
System.out.println(authors);
System.out.println(DataFetcher.transformListToDublinCore(authors));
}
@Test
void getAndTransformAuthors() throws IOException, URISyntaxException {
URL contributorsUrl = new URI("https://api.osf.io/v2/preprints/mrwqb/contributors/?format=json").toURL();
System.out.println(DataFetcher.getAndTransformAuthors(contributorsUrl));
}
@Test
void getLinkToFulltextFromJson() throws URISyntaxException, IOException {
URL linkToFullTextDocument = new URI("https://api.osf.io/v2/files/5de7c96f84c479000c7928af/?format=json").toURL();
System.out.println(DataFetcher.getFullTextLinkAndTransform(linkToFullTextDocument));
}
}

View File

@ -789,10 +789,6 @@
"value": "2227-9717",
"type": "electronic"
},
{
"value": "VALUE",
"type": "PIPPO"
},
{
"value": "1063-4584",
"type": "pu"

View File

@ -1,44 +1,15 @@
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}
{"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
{"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
{"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
{"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
{"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
{"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
{"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
{"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
{"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
{"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
{"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
{"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
{"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
{"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
{"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}

View File

@@ -1,36 +1,6 @@
{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
{"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}

View File

@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import org.junit.jupiter.api.BeforeEach
import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
import org.apache.commons.io.IOUtils
import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary()
}
@Test
def mappingRecord(): Unit = {
val input =
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
}
}

View File

@@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source
@@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testEBIData() = {
val inputXML = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
@@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader(
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml)
parser.foreach(checkPMArticle)
}
@@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testPubmedMapping(): Unit = {
val xml = new XMLEventReader(
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml)
val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
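All three hunks above make the same move: the scala-xml pull-parser XMLEventReader (tied to older Scala versions) is replaced by the StAX event reader from javax.xml.stream, which ships with the JDK and is independent of the Scala version. A minimal sketch of that pattern with explicit resource cleanup, which the test bodies above omit, is shown here; PMParser is the repository's own consumer of the event stream.

import javax.xml.stream.{XMLEventReader, XMLInputFactory}

// Open a classpath resource as a StAX event reader, run the given function,
// then close both the reader and the underlying stream.
def withXmlEvents[T](resource: String)(f: XMLEventReader => T): T = {
  val in = getClass.getResourceAsStream(resource)
  val reader = XMLInputFactory.newInstance.createXMLEventReader(in)
  try f(reader)
  finally {
    reader.close()
    in.close()
  }
}

A test could then read, for example, withXmlEvents("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")(xml => new PMParser(xml).foreach(checkPMArticle)).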

View File

@@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class PrepareSimpleEntitiesJob {
public class PrepareSimpleEntititiesJob {
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareSimpleEntitiesJob.class
PrepareSimpleEntititiesJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);

View File

@@ -160,7 +160,8 @@ public class ConversionUtils {
.stream()
.filter(Objects::nonNull)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
.map(StructuredProperty::getValue)
.map(ConversionUtils::cleanOrcid)
.filter(StringUtils::isNotBlank)

View File

@@ -7,7 +7,7 @@
</property>
<property>
<name>outputDir</name>
<description>the path where the generated data will be stored</description>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>datasourceIdWhitelist</name>
@@ -179,18 +179,17 @@
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareSimpleEntititiesJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -210,12 +209,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -236,12 +234,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -261,12 +258,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -286,12 +282,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=10000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -311,12 +306,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -338,12 +332,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -363,12 +356,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -388,12 +380,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -413,12 +404,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -438,12 +428,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -463,12 +452,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@@ -488,12 +476,11 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingDir</arg><arg>${workingDir}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
@@ -516,7 +503,6 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@@ -549,7 +535,6 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -577,7 +562,6 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -601,7 +585,6 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
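Across every spark-opts block the diff converges spark.sql.shuffle.partitions on a single value (3840) and drops the per-job spark.executor.memoryOverhead override. For reference, here is a sketch of the equivalent setting when a session is built in code rather than through Oozie spark-opts; the application name is hypothetical.

import org.apache.spark.sql.SparkSession

// Same effect as "--conf spark.sql.shuffle.partitions=3840" above.
val spark = SparkSession
  .builder()
  .appName("broker-events-example") // hypothetical name, for illustration only
  .config("spark.sql.shuffle.partitions", "3840")
  .getOrCreate()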

View File

@@ -1,64 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
class EnrichMissingAuthorOrcidTest {
final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
@BeforeEach
void setUp() throws Exception {}
@Test
void testFindDifferences_1() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
@Test
void testFindDifferences_2() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
assertEquals(1, list.size());
}
@Test
void testFindDifferences_3() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
@Test
void testFindDifferences_4() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
}

View File

@@ -2,31 +2,27 @@
package eu.dnetlib.dhp.broker.oa.util;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ConversionUtilsTest {
class ConversionUtilsTest {
@BeforeEach
public void setUp() throws Exception {}
void setUp() throws Exception {
}
@Test
public void testAllResultPids() {
void testAllResultPids() {
final Qualifier qf = new Qualifier();
qf.setClassid("test");
qf.setClassname("test");
@@ -95,42 +91,4 @@ public class ConversionUtilsTest {
assertEquals(6, list.size());
}
public void testOafResultToBrokerResult() {
final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
final Author a3 = createAuthor("Alessia Bardi", null);
final Result r = new Result();
r.setAuthor(Arrays.asList(a1, a2, a3));
final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);
assertEquals(3, br.getCreators().size());
assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
assertNull(br.getCreators().get(2).getOrcid());
}
private Author createAuthor(final String name, final String orcid) {
final Author a = new Author();
a.setFullname("Michele Artini");
if (orcid != null) {
final Qualifier q = new Qualifier();
q.setClassid(ModelConstants.ORCID);
q.setClassname(ModelConstants.ORCID);
q.setSchemeid("dnet:pids");
q.setSchemename("dnet:pids");
final StructuredProperty pid = new StructuredProperty();
pid.setQualifier(q);
pid.setValue(orcid);
a.setPid(Arrays.asList(pid));
}
return a;
}
}

View File

@@ -53,24 +53,10 @@
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -79,16 +65,10 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
@@ -101,10 +81,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>

View File

@@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3;
import scala.collection.JavaConversions;
@@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset<Row> pivotHistory = spark
.createDataset(
Collections.emptyList(),
RowEncoder
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark


@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple2;
 import scala.Tuple3;

@@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
 		StructType idsSchema = StructType
 			.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
-		Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+		Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

 		for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
 			String entityPath = graphBasePath + '/' + entityType.name();
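
Both dedup jobs above now obtain their Row encoders from SparkCompatUtils.encoderFor instead of calling RowEncoder.apply at each call site. A minimal sketch of what such a shim can look like, assuming a Spark version where RowEncoder.apply is still the factory method; the actual utility shipped in dnet-pace-core may differ:

package eu.dnetlib.pace.util

import org.apache.spark.sql.{Encoder, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType

// Sketch: keeps the Spark-version-specific way of building a Row encoder in one
// place, so a Spark upgrade only touches this object instead of every call site.
object SparkCompatUtils {

  def encoderFor(schema: StructType): Encoder[Row] =
    RowEncoder(schema) // Spark <= 3.4 entry point; newer versions moved it
}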


@@ -15,12 +15,4 @@
 		<name>oozie.action.sharelib.for.spark</name>
 		<value>spark2</value>
 	</property>
-	<property>
-		<name>hiveMetastoreUris</name>
-		<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-	</property>
-	<property>
-		<name>pivotHistoryDatabase</name>
-		<value>&#x200B;</value>
-	</property>
 </configuration>


@@ -198,8 +198,6 @@
 			<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
 			<arg>--actionSetId</arg><arg>${actionSetId}</arg>
 			<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
-			<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-			<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
 		</spark>
 		<ok to="PrepareOrgRels"/>
 		<error to="Kill"/>


@@ -49,7 +49,7 @@
 		},
 		{
 			"field": "country",
-			"comparator": "countryMatch",
+			"comparator": "exactMatch",
 			"weight": 1,
 			"countIfUndefined": "true",
 			"params": {}


@@ -73,6 +73,12 @@
 		"name": "Irish Nephrology Society",
 		"synonym": []
 	},
+	{
+		"id": "100011062",
+		"uri": "http://dx.doi.org/10.13039/100011062",
+		"name": "Asian Spinal Cord Network",
+		"synonym": []
+	},
 	{
 		"id": "100011096",
 		"uri": "http://dx.doi.org/10.13039/100011096",
@@ -217,6 +223,12 @@
 		"name": "Global Brain Health Institute",
 		"synonym": []
 	},
+	{
+		"id": "100015776",
+		"uri": "http://dx.doi.org/10.13039/100015776",
+		"name": "Health and Social Care Board",
+		"synonym": []
+	},
 	{
 		"id": "100015992",
 		"uri": "http://dx.doi.org/10.13039/100015992",
@@ -391,6 +403,18 @@
 		"name": "Irish Hospice Foundation",
 		"synonym": []
 	},
+	{
+		"id": "501100001596",
+		"uri": "http://dx.doi.org/10.13039/501100001596",
+		"name": "Irish Research Council for Science, Engineering and Technology",
+		"synonym": []
+	},
+	{
+		"id": "501100001597",
+		"uri": "http://dx.doi.org/10.13039/501100001597",
+		"name": "Irish Research Council for the Humanities and Social Sciences",
+		"synonym": []
+	},
 	{
 		"id": "501100001598",
 		"uri": "http://dx.doi.org/10.13039/501100001598",
@@ -491,7 +515,7 @@
 		"id": "501100002081",
 		"uri": "http://dx.doi.org/10.13039/501100002081",
 		"name": "Irish Research Council",
-		"synonym": ["501100001596", "501100001597"]
+		"synonym": []
 	},
 	{
 		"id": "501100002736",


@@ -560,15 +560,7 @@ case object Crossref2Oaf {
 			"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
 			"10.13039/501100013589" | "10.13039/501100000271" =>
 				generateSimpleRelationFromAward(funder, "ukri________", a => a)
-			//HFRI
-			case "10.13039/501100013209" =>
-				generateSimpleRelationFromAward(funder, "hfri________", a => a)
-				val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
-				queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
-				queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
-			//ERASMUS+
-			case "10.13039/501100010790" =>
-				generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
 			case _ => logger.debug("no match for " + funder.DOI.get)
 }
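
Besides the award-based relation, the removed HFRI branch linked each record to one fixed project by queueing the relation in both directions. A self-contained sketch of that shape; Rel and the relation labels here are illustrative stand-ins, not the repository's model classes:

case class Rel(source: String, target: String, relClass: String)

// one logical result-to-project link becomes two directed relations
def producedBy(resultId: String, projectId: String): Seq[Rel] =
  Seq(
    Rel(resultId, projectId, "isProducedBy"),
    Rel(projectId, resultId, "produces")
  )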


@@ -53,8 +53,6 @@ public class Constraints implements Serializable {
 		for (Constraint sc : constraint) {
 			boolean verified = false;
-			if (!param.containsKey(sc.getField()))
-				return false;
 			for (String value : param.get(sc.getField())) {
 				if (sc.verifyCriteria(value.trim())) {
 					verified = true;


@@ -14,7 +14,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;

@@ -85,12 +84,11 @@ public class SparkCountryPropagationJob {
 		Dataset<R> res = readPath(spark, sourcePath, resultClazz);

 		log.info("Reading prepared info: {}", preparedInfoPath);
-		final Dataset<Row> preparedInfoRaw = spark
+		Dataset<ResultCountrySet> prepared = spark
 			.read()
-			.json(preparedInfoPath);
+			.json(preparedInfoPath)
+			.as(Encoders.bean(ResultCountrySet.class));
-		if (!preparedInfoRaw.isEmpty()) {
-			final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
 		res
 			.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
 			.map(getCountryMergeFn(), Encoders.bean(resultClazz))
@@ -98,13 +96,7 @@ public class SparkCountryPropagationJob {
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(outputPath);
-		} else {
-			res
-				.write()
-				.option("compression", "gzip")
-				.mode(SaveMode.Overwrite)
-				.json(outputPath);
-		}
 	}

 	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
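
With the empty-input branch gone, the job relies on the semantics of the left outer join: results without prepared country information simply pass through unchanged, so the output is written the same way whether the prepared dataset is empty or not. A simplified sketch of that join pattern, with stand-in case classes instead of the repository's model:

import org.apache.spark.sql.{Dataset, SparkSession}

case class ResultLite(id: String, country: String)
case class CountrySet(resultId: String, country: String)

def propagate(spark: SparkSession,
              results: Dataset[ResultLite],
              prepared: Dataset[CountrySet]): Dataset[ResultLite] = {
  import spark.implicits._
  results
    .joinWith(prepared, results("id") === prepared("resultId"), "left_outer")
    .map { case (r, cs) =>
      // cs is null when no prepared row matched: the result is kept as-is
      if (cs == null) r else r.copy(country = cs.country)
    }
}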


@@ -147,7 +147,6 @@ public class CleanGraphSparkJob {
 			.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
 			.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 			.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
-			.map((MapFunction<T, T>) GraphCleaningFunctions::dedicatedUglyHacks, Encoders.bean(clazz))
 			.filter((FilterFunction<T>) GraphCleaningFunctions::filter);

 		// read the master-duplicate tuples


@@ -223,13 +223,11 @@
 				--executor-memory=${sparkExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 				--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-				--conf spark.sql.shuffle.partitions=15000
 			</spark-opts>
 			<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
 			<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -255,13 +253,11 @@
 				--executor-memory=${sparkExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 				--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-				--conf spark.sql.shuffle.partitions=15000
 			</spark-opts>
 			<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
 			<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -282,7 +278,6 @@
 				--executor-memory=${sparkExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkDriverMemory}
-				--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}


@@ -25,8 +25,7 @@ SELECT
 	o.country || '@@@dnet:countries' AS country,
 	array[]::text[] AS alternativenames,
 	'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
-	array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
-	'Unknown' AS typology
+	array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid
 FROM dsm_organizations o
 	LEFT OUTER JOIN dsm_services d ON (d.id = o.collectedfrom)
 	LEFT OUTER JOIN dsm_organizationpids p ON (p.organization = o.id)


@@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
]


@@ -0,0 +1,166 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
}
}
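
A cheap sanity check for a vocabulary like this is that it is closed under inversion: following original -> inverse -> original must lead back to the starting label. A small sketch of such a check, assuming the file is available locally; the json4s parsing mirrors ScholexplorerUtils below:

import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.io.Source

case class RelationVocabulary(original: String, inverse: String)

implicit val formats: DefaultFormats.type = DefaultFormats
val rels = parse(Source.fromFile("relations.json").mkString)
  .extract[Map[String, RelationVocabulary]]

// entries whose inverse is missing or does not point back to the original
val broken = rels.filterNot { case (_, v) =>
  rels.get(v.inverse.toLowerCase).exists(_.inverse == v.original)
}
assert(broken.isEmpty, s"asymmetric entries: ${broken.keys.mkString(", ")}")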


@@ -0,0 +1,258 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{
Scholix,
ScholixCollectedFrom,
ScholixEntityId,
ScholixIdentifier,
ScholixRelationship,
ScholixResource
}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
case class RelationInfo(
source: String,
target: String,
relclass: String,
id: String,
collectedfrom: Seq[RelKeyValue]
) {}
case class RelKeyValue(key: String, value: String) {}
object ScholexplorerUtils {
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
val mapper = new ObjectMapper()
case class RelationVocabulary(original: String, inverse: String) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def generateDatasourceOpenAIREURLS(id: String): String = {
if (id != null && id.length > 12)
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
else
null
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def generateScholixResourceFromResult(result: Result): ScholixResource = {
if (result.getInstance() == null || result.getInstance().size() == 0)
return null
if (result.getPid == null || result.getPid.isEmpty)
return null
val r = new ScholixResource
r.setDnetIdentifier(result.getId)
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
if (persistentIdentifiers.isEmpty)
return null
r.setIdentifier(persistentIdentifiers.asJava)
r.setObjectType(result.getResulttype.getClassid)
r.setObjectSubType(
result
.getInstance()
.asScala
.filter(i => i != null && i.getInstancetype != null)
.map(i => i.getInstancetype.getClassname)
.distinct
.head
)
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
r.setTitle(titles.head)
else
return null
}
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
val authors: List[ScholixEntityId] =
result.getAuthor.asScala
.map(a => {
val entity = new ScholixEntityId()
entity.setName(a.getFullname)
if (a.getPid != null && a.getPid.size() > 0)
entity.setIdentifiers(
a.getPid.asScala
.map(sp => {
val id = new ScholixIdentifier()
id.setIdentifier(sp.getValue)
id.setSchema(sp.getQualifier.getClassid)
id
})
.take(3)
.toList
.asJava
)
entity
})
.toList
if (authors.nonEmpty)
r.setCreator(authors.asJava)
}
val dt: List[String] = result
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
r.setPublicationDate(dt.distinct.head)
r.setPublisher(
result
.getInstance()
.asScala
.map(i => i.getHostedby)
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
.map(h => {
val eid = new ScholixEntityId()
eid.setName(h.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(h.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
eid.setIdentifiers(List(id).asJava)
eid
})
.distinct
.asJava
)
r.setCollectedFrom(
result.getCollectedfrom.asScala
.map(cf => {
val scf = new ScholixCollectedFrom()
scf.setProvisionMode("collected")
scf.setCompletionStatus("complete")
val eid = new ScholixEntityId()
eid.setName(cf.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(cf.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
eid.setIdentifiers(List(id).asJava)
scf.setProvider(eid)
scf
})
.asJava
)
r
}
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
val s: Scholix = new Scholix
s.setSource(source)
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
s.setLinkprovider(
relation.collectedfrom
.map(cf => {
val eid = new ScholixEntityId()
eid.setName(cf.value)
val id = new ScholixIdentifier()
id.setIdentifier(cf.key)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
eid.setIdentifiers(List(id).asJava)
eid
})
.toList
.asJava
)
else {
val eid = new ScholixEntityId()
eid.setName("OpenAIRE")
val id = new ScholixIdentifier()
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
eid.setIdentifiers(List(id).asJava)
s.setLinkprovider(List(eid).asJava)
}
s.setIdentifier(relation.id)
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setPublicationDate(source.getPublicationDate)
s.setPublisher(source.getPublisher)
val mockTarget = new ScholixResource
mockTarget.setDnetIdentifier(relation.target)
s.setTarget(mockTarget)
s
}
def updateTarget(s: Scholix, t: ScholixResource): String = {
s.setTarget(t)
val spublishers: Seq[ScholixEntityId] =
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
val tpublishers: Seq[ScholixEntityId] =
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
s.setPublisher(mergedPublishers.asJava)
mapper.writeValueAsString(s)
}
}
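
A quick illustration of how the two lookup helpers above behave; the outputs in the comments are inferred from the vocabulary and the URL template, so treat them as expected rather than verified values:

// the inverse-relation lookup lowercases its argument before the map access
ScholexplorerUtils.invRel("IsSupplementTo") // "IsSupplementedBy"
ScholexplorerUtils.invRel("somethingElse")  // null: callers must handle it

// the first three characters (the "10|" prefix) are dropped before building the URL
ScholexplorerUtils.generateDatasourceOpenAIREURLS("10|doajarticles::abc")
// "https://explore.openaire.eu/search/dataprovider?datasourceId=doajarticles::abc"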


@@ -0,0 +1,141 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{
KeyValue,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
Dataset => OafDataset
}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
/** Entry point invoked for every Spark application: this method
  * defines the whole logic of the Spark job.
  */
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
generateBidirectionalRelations(sourcePath, targetPath, spark)
generateScholixResource(sourcePath, targetPath, spark)
generateScholix(targetPath, spark)
}
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val entityMap: Map[String, StructType] = Map(
"publication" -> Encoders.bean(classOf[Publication]).schema,
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
"software" -> Encoders.bean(classOf[Software]).schema,
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
)
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
val resDs = spark.emptyDataset[ScholixResource]
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
println(s"adding ${item._1}")
res.union(
spark.read
.schema(item._2)
.json(s"$inputPath/${item._1}")
.as[Result]
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
.filter(s => s != null)
)
})
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
}
def generateBidirectionalRelations(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val relSchema = Encoders.bean(classOf[Relation]).schema
val relDF = spark.read
.schema(relSchema)
.json(s"$inputPath/relation")
.where(
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
)
.select("source", "target", "collectedfrom", "relClass")
def invRel: String => String = { s =>
ScholexplorerUtils.invRel(s)
}
import org.apache.spark.sql.functions.udf
val inverseRelationUDF = udf(invRel)
val inverseRelation = relDF.select(
col("target").alias("source"),
col("source").alias("target"),
col("collectedfrom"),
inverseRelationUDF(col("relClass")).alias("relClass")
)
val bidRel = inverseRelation
.union(relDF)
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
.drop("collectedfrom")
.withColumnRenamed("cf", "collectedfrom")
.groupBy(col("id"))
.agg(
first("source").alias("source"),
first("target").alias("target"),
first("relClass").alias("relClass"),
first("collectedfrom").alias("collectedfrom")
)
bidRel.write.mode(SaveMode.Overwrite).save(s"$outputPath/relation")
}
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
import spark.implicits._
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
val scholix_one_verse = relations
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
val resourceTarget = relations
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
scholix_one_verse
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$outputPath/scholix")
}
}
object SparkCreateScholexplorerDump {
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
def main(args: Array[String]): Unit = {
new SparkCreateScholexplorerDump(
log = logger,
args = args,
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
).initialize().run()
}
}


@@ -0,0 +1,26 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest {
@Test
def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
val app = new SparkCreateScholexplorerDump(null, null, null)
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
// app.generateBidirectionalRelations(
// "/home/sandro/Downloads/scholix_sample/",
// "/home/sandro/Downloads/scholix/",
// spark
// )
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
}
}


@@ -128,14 +128,12 @@ public class SolrAdminApplication implements Closeable {
public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException {
CollectionAdminRequest.DeleteAlias deleteAliasRequest = CollectionAdminRequest.deleteAlias(aliasName);
log.info("deleting alias: {}", aliasName);
return deleteAliasRequest.process(solrClient);
}
public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException {
CollectionAdminRequest.CreateAlias createAliasRequest = CollectionAdminRequest
.createAlias(aliasName, collection);
log.info("creating alias: {} for collection: {}", aliasName, collection);
return createAliasRequest.process(solrClient);
}
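
The two admin calls above are the building blocks of an alias-based hand-over: the indexer feeds a brand-new collection and only re-points the stable alias once feeding has succeeded, so queries switch atomically. A hedged sketch of that hand-off; the ZooKeeper address and the collection names are made-up placeholders:

import java.util.{Collections, Optional}
import org.apache.solr.client.solrj.impl.CloudSolrClient
import org.apache.solr.client.solrj.request.CollectionAdminRequest

val client = new CloudSolrClient.Builder(
  Collections.singletonList("zk1:2181"), Optional.empty[String]()).build()
try {
  // CREATEALIAS overwrites an existing alias, so re-pointing it is one call
  // and readers never observe a gap between the old and the new collection
  CollectionAdminRequest.createAlias("public", "DMF-index-2024-06-28").process(client)
} finally {
  client.close()
}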


@@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.solr.Context;
 import eu.dnetlib.dhp.schema.solr.Country;
 import eu.dnetlib.dhp.schema.solr.Datasource;
 import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
+import eu.dnetlib.dhp.schema.solr.ExternalReference;
 import eu.dnetlib.dhp.schema.solr.Instance;
 import eu.dnetlib.dhp.schema.solr.Journal;
 import eu.dnetlib.dhp.schema.solr.Measure;

@@ -375,7 +376,7 @@ public class ProvisionModelSupport {
 		rs.setIsInDiamondJournal(r.getIsInDiamondJournal());
 		rs.setPubliclyFunded(r.getPubliclyFunded());
 		rs.setTransformativeAgreement(r.getTransformativeAgreement());
+		rs.setExternalReference(mapExternalReference(r.getExternalReference()));
 		rs.setInstance(mapInstances(r.getInstance()));

 		if (r instanceof Publication) {

@@ -561,6 +562,27 @@ public class ProvisionModelSupport {
 			.orElse(null);
 	}

+	private static List<ExternalReference> mapExternalReference(
+		List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
+		return Optional
+			.ofNullable(externalReference)
+			.map(
+				ext -> ext
+					.stream()
+					.map(
+						e -> ExternalReference
+							.newInstance(
+								e.getSitename(),
+								e.getLabel(),
+								e.getAlternateLabel(),
+								e.getUrl(),
+								mapCodeLabel(e.getQualifier()),
+								e.getRefidentifier(),
+								e.getQuery()))
+					.collect(Collectors.toList()))
+			.orElse(Lists.newArrayList());
+	}
+
 	private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList,
 		ContextMapper contextMapper) {

@@ -579,7 +601,7 @@ public class ProvisionModelSupport {
 	}

 		return Optional
-			.ofNullable(contexts)
+			.of(contexts)
 			.map(
 				ctx -> ctx
 					.stream()
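
The .ofNullable(contexts) to .of(contexts) switch differs in exactly one case: Optional.of throws on a null argument instead of yielding an empty Optional, so it is only safe when contexts is guaranteed to be non-null at that point. In short:

// java.util.Optional semantics, shown from Scala for illustration
java.util.Optional.ofNullable[String](null) // Optional.empty
java.util.Optional.of[String](null)         // throws NullPointerException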


@@ -213,7 +213,7 @@ public class StreamingInputDocumentFactory implements Serializable {
 			}
 			writer.close();
 			indexDocument.addField(INDEX_RESULT, results.toString());
-			// indexDocument.addField(INDEX_JSON_RESULT, json);
+			indexDocument.addField(INDEX_JSON_RESULT, json);
 		} finally {
 			outputFactory.remove();
 			eventFactory.remove();


@@ -69,7 +69,7 @@
 		</configuration>
 	</global>

-	<start to="irish_oaiphm_provision"/>
+	<start to="oaiphm_provision"/>

 	<kill name="Kill">
 		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>


@@ -67,7 +67,7 @@ public class PrepareRelationsJobTest {
 	@Test
 	void testRunPrepareRelationsJob(@TempDir Path testPath) throws Exception {

-		final int maxRelations = 5;
+		final int maxRelations = 20;
 		PrepareRelationsJob
 			.main(
 				new String[] {
@@ -86,7 +86,7 @@ public class PrepareRelationsJobTest {
 			.as(Encoders.bean(Relation.class))
 			.cache();

-		assertEquals(44, out.count());
+		assertEquals(maxRelations, out.count());

 		Dataset<Row> freq = out
 			.toDF()
@@ -101,8 +101,12 @@ public class PrepareRelationsJobTest {
 		long affiliation = getRows(freq, AFFILIATION).get(0).getAs("count");

-		assertEquals(outcome, participation);
-		assertEquals(outcome, affiliation);
-		assertEquals(4, affiliation);
+		assertTrue(outcome > affiliation);
+		assertTrue(participation > affiliation);
+		assertEquals(7, outcome);
+		assertEquals(7, participation);
+		assertEquals(6, affiliation);
 	}

 	protected List<Row> getRows(Dataset<Row> freq, String col) {
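
Judging from the updated assertions, the job caps the number of relations kept per source at maxRelations while preferring some relation classes over others, which is why affiliation ends up below outcome and participation. A generic sketch of that kind of pruning; the column names, including a precomputed priority derived from the relation class, are assumptions:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// keep at most maxRelations rows per source; lower priority values win
def prune(rels: DataFrame, maxRelations: Int): DataFrame = {
  val bySource = Window.partitionBy(col("source")).orderBy(col("priority"))
  rels
    .withColumn("rn", row_number().over(bySource))
    .where(col("rn") <= maxRelations)
    .drop("rn")
}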
