Compare commits


No commits in common. "main" and "WebCrowlBeta" have entirely different histories.

330 changed files with 8226 additions and 12955 deletions

.gitignore vendored
View File

@ -27,4 +27,3 @@ spark-warehouse
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf /**/.scalafmt.conf
/.java-version /.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -80,15 +80,7 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder; mojo.outputFile = testFolder;
// execute // execute
try { Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
} }
@Test @Test
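
The hunk above replaces a try/catch-with-flag pattern with JUnit 5's Assertions.assertThrows. A minimal, self-contained sketch of the same idiom follows; the test class and the failing call are stand-ins, not taken from the project.

import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

class AssertThrowsSketch {

    @Test
    void failingCallIsReportedAsSingleAssertion() {
        // assertThrows both asserts that the exception is raised and returns it,
        // so no boolean flag or catch block is needed.
        NumberFormatException ex = assertThrows(
            NumberFormatException.class,
            () -> Integer.parseInt("not-a-number"));   // stand-in for mojo.execute()
        assertTrue(ex.getMessage().contains("not-a-number"));
    }
}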

View File

@ -63,14 +63,15 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>edu.cmu</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>secondstring</artifactId> <artifactId>dhp-pace-core</artifactId>
</dependency> <version>${project.version}</version>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
@ -160,7 +161,7 @@
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId> <artifactId>${dhp-schemas.artifact}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
@ -169,23 +170,4 @@
</dependency> </dependency>
</dependencies> </dependencies>
<!-- dependencies required on JDK9+ because J2EE has been removed -->
<profiles>
<profile>
<id>spark-34</id>
<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.2.11</version>
</dependency>
<dependency>
<groupId>com.sun.xml.ws</groupId>
<artifactId>jaxws-ri</artifactId>
<version>2.3.3</version>
<type>pom</type>
</dependency>
</dependencies>
</profile>
</profiles>
</project> </project>

View File

@ -7,12 +7,12 @@ import java.sql.*;
import java.util.function.Consumer; import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.apache.commons.logging.Log;
import org.slf4j.LoggerFactory; import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable { public class DbClient implements Closeable {
private static final Logger log = LoggerFactory.getLogger(DbClient.class); private static final Log log = LogFactory.getLog(DbClient.class);
private final Connection connection; private final Connection connection;
@ -37,8 +37,6 @@ public class DbClient implements Closeable {
try (final Statement stmt = connection.createStatement()) { try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100); stmt.setFetchSize(100);
log.info("running SQL:\n\n{}\n\n", sql);
try (final ResultSet rs = stmt.executeQuery(sql)) { try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) { while (rs.next()) {
consumer.accept(rs); consumer.accept(rs);
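
The DbClient hunk above keeps the same streaming pattern: a statement with a small fetch size feeds each row to a Consumer. A generic, self-contained JDBC sketch of that pattern is shown below; the JDBC URL, credentials and query are placeholders, not taken from the diff.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.function.Consumer;

public class ResultSetStreamingSketch {

    // Runs the query and hands every row to the consumer, keeping memory bounded
    // via a small fetch size, mirroring the try-with-resources block above.
    static void processResults(Connection connection, String sql, Consumer<ResultSet> consumer)
        throws SQLException {
        try (Statement stmt = connection.createStatement()) {
            stmt.setFetchSize(100);
            try (ResultSet rs = stmt.executeQuery(sql)) {
                while (rs.next()) {
                    consumer.accept(rs);
                }
            }
        }
    }

    public static void main(String[] args) throws SQLException {
        try (Connection c = DriverManager.getConnection("jdbc:postgresql://host/db", "user", "pwd")) {
            processResults(c, "SELECT id FROM datasources", rs -> {
                try {
                    System.out.println(rs.getString("id"));
                } catch (SQLException e) {
                    throw new RuntimeException(e);
                }
            });
        }
    }
}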

View File

@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class PacePerson.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt"))); "/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (Exception e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
}
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.lenght = len;
}
@Override
public MediaType contentType() {
return mediaType;
}
@Override
public long contentLength() {
return lenght;
}
@Override
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
sink.writeAll(source);
} finally {
Util.closeQuietly(source);
}
}
}
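
A hedged usage sketch of the InputStreamRequestBody added above: streaming a local file to a target URL with OkHttp instead of buffering it in memory. The URL, token and file path are placeholders.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import eu.dnetlib.dhp.common.api.InputStreamRequestBody;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

public class StreamingUploadSketch {

    public static void main(String[] args) throws IOException {
        Path archive = Paths.get("/tmp/dump.zip");                 // placeholder path
        long size = Files.size(archive);

        try (InputStream is = new FileInputStream(archive.toFile())) {
            // wrap the stream so OkHttp can send it without loading it all in memory
            RequestBody body = InputStreamRequestBody
                .create(MediaType.parse("application/zip"), is, size);

            Request request = new Request.Builder()
                .url("https://example.org/api/files/dump.zip")      // placeholder target URL
                .addHeader("Authorization", "Bearer ACCESS_TOKEN")  // placeholder token
                .put(body)
                .build();

            try (Response response = new OkHttpClient().newCall(request).execute()) {
                System.out.println("upload status: " + response.code());
            }
        }
    }
}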

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.common.api;
public class MissingConceptDoiException extends Throwable {
public MissingConceptDoiException(String message) {
super(message);
}
}

View File

@ -0,0 +1,365 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.jetbrains.annotations.NotNull;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public void setDeposition_id(String deposition_id) {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
}
/**
* Creates a brand new deposition in Zenodo. It sets the deposition_id and the bucket where the files to upload will be stored
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
* Uploads a file to Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @return the response code
*/
public int uploadIS(InputStream is, String file_name) throws IOException {
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
}
int responseCode = conn.getResponseCode();
if (!checkOKStatus(responseCode)) {
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
* Associates metadata information with the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMretadata(String metadata) throws IOException {
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
if (HttpURLConnection.HTTP_OK != responseCode ||
HttpURLConnection.HTTP_CREATED != responseCode)
return true;
return false;
}
/**
* Publishes the current deposition. It works both for a new deposition and for a new version of an old deposition
*
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
}
/**
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* part of the URL of the DOI that Zenodo suggests for citing all versions, e.g. for DOI 10.xxx/zenodo.656930
* the concept_rec_id is 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
* Resumes uploading to a version or deposition that has not yet been published.
* It sets the deposition_id and the bucket to be used.
*
* @param deposition_id the deposition id of the not yet published upload
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
String json = "{}";
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
deposition_id = zm.getId();
return;
}
}
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions(String page) throws IOException {
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
private String getBucket(String inputUurl) throws IOException {
URL url = new URL(inputUurl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if (!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
}
}
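
A hedged end-to-end sketch of the ZenodoAPIClient above: create a deposition, stream a zip into its bucket, attach metadata, and publish. The base URL, token, file path and metadata JSON are placeholders; publish() is marked @Deprecated in the class, so it may be replaced by a manual step on the Zenodo UI.

import java.io.FileInputStream;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoUploadSketch {

    public static void main(String[] args) throws Exception {
        // placeholder endpoint and token; a sandbox endpoint is typically used for tests
        ZenodoAPIClient client = new ZenodoAPIClient(
            "https://sandbox.zenodo.org/api/deposit/depositions", "ACCESS_TOKEN");

        // 1) create the deposition: sets deposition_id and the upload bucket
        client.newDeposition();

        // 2) stream the payload into the bucket
        try (InputStream is = new FileInputStream("/tmp/dump.zip")) {   // placeholder path
            client.uploadIS(is, "dump.zip");
        }

        // 3) attach metadata (note the method name as defined above: sendMretadata);
        //    the JSON shape below is an assumption for illustration only
        String metadataJson = "{\"metadata\": {\"title\": \"Graph dump\", "
            + "\"upload_type\": \"dataset\", \"description\": \"placeholder\", "
            + "\"creators\": [{\"name\": \"Doe, Jane\"}]}}";
        client.sendMretadata(metadataJson);

        // 4) publish (deprecated in the class above, kept here for completeness)
        client.publish();
    }
}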

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Community {
private String identifier;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.common.api.zenodo;
public class Creator {
private String affiliation;
private String name;
private String orcid;
public String getAffiliation() {
return affiliation;
}
public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (name != null) {
c.name = name;
}
if (affiliation != null) {
c.affiliation = affiliation;
}
if (orcid != null) {
c.orcid = orcid;
}
return c;
}
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
public String getChecksum() {
return checksum;
}
public void setChecksum(String checksum) {
this.checksum = checksum;
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public long getFilesize() {
return filesize;
}
public void setFilesize(long filesize) {
this.filesize = filesize;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Grant implements Serializable {
private String id;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Grant newInstance(String id) {
Grant g = new Grant();
g.id = id;
return g;
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class Links implements Serializable {
private String bucket;
private String discard;
private String edit;
private String files;
private String html;
private String latest_draft;
private String latest_draft_html;
private String publish;
private String self;
public String getBucket() {
return bucket;
}
public void setBucket(String bucket) {
this.bucket = bucket;
}
public String getDiscard() {
return discard;
}
public void setDiscard(String discard) {
this.discard = discard;
}
public String getEdit() {
return edit;
}
public void setEdit(String edit) {
this.edit = edit;
}
public String getFiles() {
return files;
}
public void setFiles(String files) {
this.files = files;
}
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getLatest_draft() {
return latest_draft;
}
public void setLatest_draft(String latest_draft) {
this.latest_draft = latest_draft;
}
public String getLatest_draft_html() {
return latest_draft_html;
}
public void setLatest_draft_html(String latest_draft_html) {
this.latest_draft_html = latest_draft_html;
}
public String getPublish() {
return publish;
}
public void setPublish(String publish) {
this.publish = publish;
}
public String getSelf() {
return self;
}
public void setSelf(String self) {
this.self = self;
}
}

View File

@ -0,0 +1,153 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
}
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public List<Community> getCommunities() {
return communities;
}
public void setCommunities(List<Community> communities) {
this.communities = communities;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<Grant> getGrants() {
return grants;
}
public void setGrants(List<Grant> grants) {
this.grants = grants;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getLicense() {
return license;
}
public void setLicense(String license) {
this.license = license;
}
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
}
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
}
public List<String> getReferences() {
return references;
}
public void setReferences(List<String> references) {
this.references = references;
}
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
}
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}
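
The Metadata bean above mirrors Zenodo's deposit metadata fields; serialising it with Gson yields the JSON body passed to ZenodoAPIClient.sendMretadata. A hedged sketch follows; the field values and the outer "metadata" wrapper are illustrative assumptions, not taken from the diff.

import java.util.Collections;

import com.google.gson.Gson;

import eu.dnetlib.dhp.common.api.zenodo.Creator;
import eu.dnetlib.dhp.common.api.zenodo.Metadata;

public class MetadataJsonSketch {

    public static void main(String[] args) {
        Metadata md = new Metadata();
        md.setTitle("OpenAIRE graph dump");                       // illustrative values
        md.setUpload_type("dataset");
        md.setDescription("Example deposition metadata");
        md.setCreators(Collections.singletonList(Creator.newInstance("Doe, Jane", "CNR", null)));

        // Gson serialises the bean by field name, so the property names above
        // (upload_type, publication_date, ...) become the JSON keys directly.
        String json = "{\"metadata\":" + new Gson().toJson(md) + "}";  // assumed wrapper
        System.out.println(json);
    }
}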

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class PrereserveDoi implements Serializable {
private String doi;
private String recid;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getRecid() {
return recid;
}
public void setRecid(String recid) {
this.recid = recid;
}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
public class RelatedIdentifier implements Serializable {
private String identifier;
private String relation;
private String resource_type;
private String scheme;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelation() {
return relation;
}
public void setRelation(String relation) {
this.relation = relation;
}
public String getResource_type() {
return resource_type;
}
public void setResource_type(String resource_type) {
this.resource_type = resource_type;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
}
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public List<File> getFiles() {
return files;
}
public void setFiles(List<File> files) {
this.files = files;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Links getLinks() {
return links;
}
public void setLinks(Links links) {
this.links = links;
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public String getModified() {
return modified;
}
public void setModified(String modified) {
this.modified = modified;
}
public String getOwner() {
return owner;
}
public void setOwner(String owner) {
this.owner = owner;
}
public String getRecord_id() {
return record_id;
}
public void setRecord_id(String record_id) {
this.record_id = record_id;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public boolean isSubmitted() {
return submitted;
}
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

View File

@ -12,7 +12,9 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders; import org.apache.http.HttpHeaders;
import org.joda.time.Instant;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -212,11 +214,11 @@ public class HttpConnector2 {
.format( .format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(), "Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report))); MAPPER.writeValueAsString(report)));
} catch (MalformedURLException e) { } catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e); throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException | UnknownHostException e) { } catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
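
The HttpConnector2 hunk above moves UnknownHostException from the retry branch into the fail-fast branch alongside MalformedURLException. A compact, self-contained sketch of that split follows; the class, interface and parameter names are hypothetical.

import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;

public class RetryPolicySketch {

    interface UrlFetcher {
        String fetch(String url) throws Exception;
    }

    // Fail fast on errors that will never succeed on retry (bad URL, unknown host);
    // back off and retry on transient network errors (timeouts, resets).
    static String fetchWithRetry(UrlFetcher fetcher, String url, int maxRetries, long retryDelayMs)
        throws Exception {
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                return fetcher.fetch(url);
            } catch (MalformedURLException | UnknownHostException e) {
                throw e;                                        // permanent: do not retry
            } catch (SocketTimeoutException | SocketException e) {
                Thread.sleep(retryDelayMs * attempt);           // transient: back off and retry
            }
        }
        throw new Exception("giving up after " + maxRetries + " attempts: " + url);
    }
}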

View File

@ -135,7 +135,7 @@ public class GroupEntitiesSparkJob {
.applyCoarVocabularies(entity, vocs), .applyCoarVocabularies(entity, vocs),
OAFENTITY_KRYO_ENC) OAFENTITY_KRYO_ENC)
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING()) .groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC) .mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeGroup, OAFENTITY_KRYO_ENC)
.map( .map(
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>( (MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
t.getClass().getName(), t), t.getClass().getName(), t),

View File

@ -1,70 +0,0 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
public class HashableStructuredProperty extends StructuredProperty {
private static final long serialVersionUID = 8371670185221126045L;
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
if (value == null) {
return null;
}
final HashableStructuredProperty sp = new HashableStructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
HashableStructuredProperty hsp = new HashableStructuredProperty();
hsp.setQualifier(sp.getQualifier());
hsp.setValue(sp.getValue());
hsp.setQualifier(sp.getQualifier());
return hsp;
}
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
StructuredProperty sp = new StructuredProperty();
sp.setQualifier(hsp.getQualifier());
sp.setValue(hsp.getValue());
sp.setQualifier(hsp.getQualifier());
return sp;
}
@Override
public int hashCode() {
return new HashCodeBuilder(11, 91)
.append(getQualifier().getClassid())
.append(getQualifier().getSchemeid())
.append(getValue())
.hashCode();
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (obj == this) {
return true;
}
if (obj.getClass() != getClass()) {
return false;
}
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
return new EqualsBuilder()
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
.append(getValue(), rhs.getValue())
.isEquals();
}
}

View File

@ -43,4 +43,34 @@ public class CleaningFunctions {
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue); return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
} }
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
pid
.setValue(
normalizePidValue(
pid.getQualifier().getClassid(),
pid.getValue()));
return pid;
}
public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional
.ofNullable(pidValue)
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pidType) {
// TODO add cleaning for more PID types as needed
case "doi":
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
}
return value;
}
} }
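
A hedged sketch of the per-type PID normalisation added above. The real DOI_PREFIX and DOI_PREFIX_REGEX constants live elsewhere in CleaningFunctions and are not shown in this diff; the values below are assumptions for illustration only.

import java.util.Optional;

public class PidNormalisationSketch {

    private static final String DOI_PREFIX = "10.";                // assumed value
    private static final String DOI_PREFIX_REGEX = "^.*?(10\\.)";  // assumed value

    static String normalizePidValue(String pidType, String pidValue) {
        String value = Optional
            .ofNullable(pidValue)
            .map(String::trim)
            .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
        if ("doi".equals(pidType)) {
            // DOIs are lowercased and any resolver/scheme prefix is collapsed to "10."
            return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
        }
        return value;
    }

    public static void main(String[] args) {
        // " https://doi.org/10.1000/ABC " -> "10.1000/abc" under the assumed regex
        System.out.println(normalizePidValue("doi", " https://doi.org/10.1000/ABC "));
    }
}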

View File

@ -6,11 +6,18 @@ import org.apache.commons.lang3.StringUtils;
public class DoiCleaningRule { public class DoiCleaningRule {
public static String clean(final String doi) { public static String clean(final String doi) {
if (doi == null) return doi
return null; .toLowerCase()
final String replaced = doi .replaceAll("\\s", "")
.replaceAll("\\n|\\r|\\t|\\s", "")
.replaceAll("^doi:", "") .replaceAll("^doi:", "")
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
}
public static String normalizeDoi(final String input) {
if (input == null)
return null;
final String replaced = input
.replaceAll("\\n|\\r|\\t|\\s", "")
.toLowerCase() .toLowerCase()
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX); .replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
if (StringUtils.isEmpty(replaced)) if (StringUtils.isEmpty(replaced))
@ -25,6 +32,7 @@ public class DoiCleaningRule {
return null; return null;
return ret; return ret;
} }
} }

View File

@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.getContext() .getContext()
.stream() .stream()
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId)) .filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
.collect(Collectors.toCollection(ArrayList::new))); .collect(Collectors.toList()));
} }
return (T) res; return (T) res;
} else { } else {
@ -563,24 +563,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
Optional Optional
.ofNullable(i.getPid()) .ofNullable(i.getPid())
.ifPresent(pid -> { .ifPresent(pid -> {
final Set<HashableStructuredProperty> pids = pid final Set<StructuredProperty> pids = Sets.newHashSet(pid);
.stream()
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
Optional Optional
.ofNullable(i.getAlternateIdentifier()) .ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> { .ifPresent(altId -> {
final Set<HashableStructuredProperty> altIds = altId final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
.stream() i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
i
.setAlternateIdentifier(
Sets
.difference(altIds, pids)
.stream()
.map(HashableStructuredProperty::toStructuredProperty)
.collect(Collectors.toList()));
}); });
}); });
@ -1015,41 +1003,4 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.orElse(null); .orElse(null);
} }
/**
* Implements bad and ugly things that we should get rid of ASAP.
*
* @param value
* @return
* @param <T>
*/
public static <T extends Oaf> T dedicatedUglyHacks(T value) {
if (value instanceof OafEntity) {
if (value instanceof Result) {
final Result r = (Result) value;
// Fix for AMS Acta
Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(
i -> Optional
.ofNullable(i.getHostedby())
.map(KeyValue::getKey)
.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
.orElse(false)))
.ifPresent(instance -> instance.forEach(i -> {
if (Optional
.ofNullable(i.getPid())
.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
.orElse(false)) {
i.setHostedby(UNKNOWN_REPOSITORY);
}
}));
}
}
return value;
}
} }
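
A minimal sketch of the simplification above: with a Guava set difference, alternate identifiers that already appear among the instance PIDs are dropped. The diff relies on value-based equality of the PID objects (the dedicated HashableStructuredProperty wrapper is removed), so plain strings stand in for PIDs here.

import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class AltIdDiffSketch {

    public static void main(String[] args) {
        Set<String> pids = Sets.newHashSet("doi::10.1/a", "pmid::123");
        Set<String> altIds = Sets.newHashSet("doi::10.1/a", "handle::h/1");

        // alternate identifiers already present among the PIDs are removed
        List<String> cleanedAltIds = Lists.newArrayList(Sets.difference(altIds, pids));
        System.out.println(cleanedAltIds);   // [handle::h/1]
    }
}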

View File

@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
return entity return entity
.getPid() .getPid()
.stream() .stream()
.map(PidCleaner::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter) .filter(CleaningFunctions::pidFilter)
.collect( .collect(
Collectors Collectors
@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
// filter away PIDs provided by a DS that is not considered an authority for the // filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type // given PID Type
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles)) .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
.map(PidCleaner::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p)) .filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
.filter(CleaningFunctions::pidFilter)) .filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty()); .orElse(Stream.empty());

View File

@ -1,106 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
public class MergeEntitiesComparator implements Comparator<Oaf> {
static final List<String> PID_AUTHORITIES = Arrays
.asList(
ModelConstants.ARXIV_ID,
ModelConstants.PUBMED_CENTRAL_ID,
ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
ModelConstants.DATACITE_ID,
ModelConstants.CROSSREF_ID);
static final List<String> RESULT_TYPES = Arrays
.asList(
ModelConstants.ORP_RESULTTYPE_CLASSID,
ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
ModelConstants.DATASET_RESULTTYPE_CLASSID,
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
@Override
public int compare(Oaf left, Oaf right) {
if (left == null && right == null)
return 0;
if (left == null)
return -1;
if (right == null)
return 1;
int res = 0;
// pid authority
int cfp1 = Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
int cfp2 = Optional
.ofNullable(right.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
.max(Integer::compare)
.orElse(-1))
.orElse(-1);
if (cfp1 >= 0 && cfp1 > cfp2) {
return 1;
} else if (cfp2 >= 0 && cfp2 > cfp1) {
return -1;
}
// trust
if (left.getDataInfo() != null && right.getDataInfo() != null) {
res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
}
// result type
if (res == 0) {
if (left instanceof Result && right instanceof Result) {
Result r1 = (Result) left;
Result r2 = (Result) right;
if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
return -1;
}
} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
return 1;
}
int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
if (rt1 >= 0 && rt1 > rt2) {
return 1;
} else if (rt2 >= 0 && rt2 > rt1) {
return -1;
}
}
}
// id
if (res == 0) {
if (left instanceof OafEntity && right instanceof OafEntity) {
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
}
}
return res;
}
}

View File

@ -31,37 +31,39 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class MergeUtils { public class MergeUtils {
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
return mergeGroup(s, oafEntityIterator, true);
}
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) { public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
return mergeGroup(s, oafEntityIterator, false); TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
int res = 0;
if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
} }
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator, if (res == 0) {
boolean checkDelegateAuthority) { if (o1 instanceof Result && o2 instanceof Result) {
return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
}
}
ArrayList<T> sortedEntities = new ArrayList<>(); return res;
oafEntityIterator.forEachRemaining(sortedEntities::add); });
sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
Iterator<T> it = sortedEntities.iterator(); while (oafEntityIterator.hasNext()) {
T merged = it.next(); sortedEntities.add(oafEntityIterator.next());
}
T merged = sortedEntities.descendingIterator().next();
Iterator<T> it = sortedEntities.descendingIterator();
while (it.hasNext()) { while (it.hasNext()) {
merged = checkedMerge(merged, it.next(), checkDelegateAuthority); merged = checkedMerge(merged, it.next());
} }
return merged; return merged;
} }
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) { public static <T extends Oaf> T checkedMerge(final T left, final T right) {
return (T) merge(left, right, checkDelegateAuthority); return (T) merge(left, right, false);
}
public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
return (Result) merge(left, right, false);
} }
public static Oaf merge(final Oaf left, final Oaf right) { public static Oaf merge(final Oaf left, final Oaf right) {
@ -89,7 +91,7 @@ public class MergeUtils {
private static Oaf mergeEntities(Oaf left, Oaf right, boolean checkDelegatedAuthority) { private static Oaf mergeEntities(Oaf left, Oaf right, boolean checkDelegatedAuthority) {
if (sameClass(left, right, Result.class)) { if (sameClass(left, right, Result.class)) {
if (checkDelegatedAuthority) { if (!left.getClass().equals(right.getClass()) || checkDelegatedAuthority) {
return mergeResultsOfDifferentTypes((Result) left, (Result) right); return mergeResultsOfDifferentTypes((Result) left, (Result) right);
} }
@ -106,7 +108,7 @@ public class MergeUtils {
return mergeSoftware((Software) left, (Software) right); return mergeSoftware((Software) left, (Software) right);
} }
return mergeResultFields((Result) left, (Result) right); return mergeResult((Result) left, (Result) right);
} else if (sameClass(left, right, Datasource.class)) { } else if (sameClass(left, right, Datasource.class)) {
// TODO // TODO
final int trust = compareTrust(left, right); final int trust = compareTrust(left, right);
@ -129,7 +131,7 @@ public class MergeUtils {
* https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
* such version. * such version.
* <p> * <p>
* Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator} * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
* and proceeds with the canonical property merging. * and proceeds with the canonical property merging.
* *
* @param left * @param left
@ -147,12 +149,11 @@ public class MergeUtils {
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) { if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
return right; return right;
} }
// TODO: raise trust to have preferred fields from one or the other?? // TODO: raise trust to have preferred fields from one or the other??
if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) { if (new ResultTypeComparator().compare(left, right) < 0) {
return mergeResultFields(left, right); return mergeResult(left, right);
} else { } else {
return mergeResultFields(right, left); return mergeResult(right, left);
} }
} }
@ -212,9 +213,9 @@ public class MergeUtils {
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust, private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
Function<T, K> keyExtractor, BinaryOperator<T> merger) { Function<T, K> keyExtractor, BinaryOperator<T> merger) {
if (left == null || left.isEmpty()) { if (left == null) {
return right != null ? right : new ArrayList<>(); return right;
} else if (right == null || right.isEmpty()) { } else if (right == null) {
return left; return left;
} }
@ -262,12 +263,6 @@ public class MergeUtils {
// TODO review // TODO review
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) { private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
if (left == null) {
return right;
} else if (right == null) {
return left;
}
if (trust < 0) { if (trust < 0) {
List<KeyValue> s = left; List<KeyValue> s = left;
left = right; left = right;
@ -329,7 +324,7 @@ public class MergeUtils {
final T merged = mergeOafFields(original, enrich, trust); final T merged = mergeOafFields(original, enrich, trust);
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId())); merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1)); merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
merged.setDateofcollection(LocalDateTime.now().toString()); merged.setDateofcollection(LocalDateTime.now().toString());
merged merged
.setDateoftransformation( .setDateoftransformation(
@ -373,7 +368,7 @@ public class MergeUtils {
return merge; return merge;
} }
private static <T extends Result> T mergeResultFields(T original, T enrich) { public static <T extends Result> T mergeResult(T original, T enrich) {
final int trust = compareTrust(original, enrich); final int trust = compareTrust(original, enrich);
T merge = mergeOafEntityFields(original, enrich, trust); T merge = mergeOafEntityFields(original, enrich, trust);
@ -392,7 +387,7 @@ public class MergeUtils {
} }
// should be an instance attribute, get the first non-null value // should be an instance attribute, get the first non-null value
merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage())); merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
// distinct countries, do not manage datainfo // distinct countries, do not manage datainfo
merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust)); merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@ -433,10 +428,7 @@ public class MergeUtils {
// merge datainfo for same context id // merge datainfo for same context id
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> { merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
ArrayList<DataInfo> di = new ArrayList<>(); r.getDataInfo().addAll(l.getDataInfo());
di.addAll(r.getDataInfo());
di.addAll(l.getDataInfo());
r.setDataInfo(di);
return r; return r;
})); }));
@ -468,10 +460,6 @@ public class MergeUtils {
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal())); merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded())); merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
if (StringUtils.isBlank(merge.getTransformativeAgreement())) {
merge.setTransformativeAgreement(enrich.getTransformativeAgreement());
}
return merge; return merge;
} }
@ -569,13 +557,6 @@ public class MergeUtils {
return m != null ? m : e; return m != null ? m : e;
} }
private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
return e;
}
return m;
}
private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) { private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
List<List<Author>> authors = new ArrayList<>(); List<List<Author>> authors = new ArrayList<>();
if (author != null) { if (author != null) {
@ -588,10 +569,6 @@ public class MergeUtils {
} }
private static String instanceKeyExtractor(Instance i) { private static String instanceKeyExtractor(Instance i) {
// three levels of concatenating:
// 1. ::
// 2. @@
// 3. ||
return String return String
.join( .join(
"::", "::",
@ -599,10 +576,10 @@ public class MergeUtils {
kvKeyExtractor(i.getCollectedfrom()), kvKeyExtractor(i.getCollectedfrom()),
qualifierKeyExtractor(i.getAccessright()), qualifierKeyExtractor(i.getAccessright()),
qualifierKeyExtractor(i.getInstancetype()), qualifierKeyExtractor(i.getInstancetype()),
Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
Optional Optional
.ofNullable(i.getPid()) .ofNullable(i.getPid())
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
.orElse(null)); .orElse(null));
} }
@ -663,13 +640,6 @@ public class MergeUtils {
return d1; return d1;
} }
if (StringUtils.contains(d1.getValue(), "null")) {
return d2;
}
if (StringUtils.contains(d2.getValue(), "null")) {
return d1;
}
return Stream return Stream
.of(d1, d2) .of(d1, d2)
.min( .min(
@ -718,13 +688,13 @@ public class MergeUtils {
private static String spKeyExtractor(StructuredProperty sp) { private static String spKeyExtractor(StructuredProperty sp) {
return Optional return Optional
.ofNullable(sp) .ofNullable(sp)
.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) .map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
.orElse(null); .orElse(null);
} }
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) { private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
final T merge = mergeResultFields(original, enrich); final T merge = mergeResult(original, enrich);
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust)); merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust)); merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
@ -735,7 +705,7 @@ public class MergeUtils {
private static <T extends Software> T mergeSoftware(T original, T enrich) { private static <T extends Software> T mergeSoftware(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
final T merge = mergeResultFields(original, enrich); final T merge = mergeResult(original, enrich);
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust)); merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust)); merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
@ -749,7 +719,7 @@ public class MergeUtils {
private static <T extends Dataset> T mergeDataset(T original, T enrich) { private static <T extends Dataset> T mergeDataset(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
T merge = mergeResultFields(original, enrich); T merge = mergeResult(original, enrich);
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust)); merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust)); merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
@ -768,7 +738,7 @@ public class MergeUtils {
public static <T extends Publication> T mergePublication(T original, T enrich) { public static <T extends Publication> T mergePublication(T original, T enrich) {
final int trust = compareTrust(original, enrich); final int trust = compareTrust(original, enrich);
T merged = mergeResultFields(original, enrich); T merged = mergeResult(original, enrich);
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust)); merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
@ -886,11 +856,9 @@ public class MergeUtils {
if (toEnrichInstances == null) { if (toEnrichInstances == null) {
return enrichmentResult; return enrichmentResult;
} }
if (enrichmentInstances == null) {
if (enrichmentInstances == null || enrichmentInstances.isEmpty()) { return enrichmentResult;
return toEnrichInstances;
} }
Map<String, Instance> ri = toInstanceMap(enrichmentInstances); Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
toEnrichInstances.forEach(i -> { toEnrichInstances.forEach(i -> {
@ -975,7 +943,7 @@ public class MergeUtils {
private static String extractKeyFromPid(final StructuredProperty pid) { private static String extractKeyFromPid(final StructuredProperty pid) {
if (pid == null) if (pid == null)
return null; return null;
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid); final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue()); return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
} }

View File

@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null) if (right == null)
return -1; return -1;
StructuredProperty l = PidCleaner.normalizePidValue(left); StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = PidCleaner.normalizePidValue(right); StructuredProperty r = CleaningFunctions.normalizePidValue(right);
return Optional return Optional
.ofNullable(l.getValue()) .ofNullable(l.getValue())

View File

@ -0,0 +1,78 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
public class ResultTypeComparator implements Comparator<Result> {
public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
HashSet<String> lCf = getCollectedFromIds(left);
HashSet<String> rCf = getCollectedFromIds(right);
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
return -1;
}
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
return 1;
}
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (!lClass.equals(rClass)) {
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
}
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
protected HashSet<String> getCollectedFromIds(Result left) {
return Optional
.ofNullable(left.getCollectedfrom())
.map(
cf -> cf
.stream()
.map(KeyValue::getKey)
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
}
}

View File

@ -28,7 +28,6 @@ import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import net.minidev.json.JSONArray; import net.minidev.json.JSONArray;
import scala.collection.JavaConverters; import scala.collection.JavaConverters;
import scala.collection.Seq; import scala.collection.Seq;
@ -105,7 +104,7 @@ public class DHPUtils {
public static String generateUnresolvedIdentifier(final String pid, final String pidType) { public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid); final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()); return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
} }
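
Both sides of the hunk above produce the same identifier shape; only the normalisation helper changes. A tiny illustration of the resulting format, with a placeholder DOI:

public class UnresolvedIdSketch {

    public static void main(String[] args) {
        String cleanedPid = "10.1000/abc";   // placeholder, assumed already normalised
        String pidType = "DOI";

        // same format string as generateUnresolvedIdentifier above
        System.out.println(String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()));
        // prints: unresolved::10.1000/abc::doi
    }
}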

View File

@ -1,101 +0,0 @@
package eu.dnetlib.pace.common;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
/**
* Set of common functions for the framework
*
* @author claudio
*/
public class PaceCommonUtils {
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String unicodeNormalization(final String s) {
Matcher m = hexUnicodePattern.matcher(s);
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
}
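
A simplified, self-contained approximation of the normalize() pipeline above, useful for quick experiments: it skips the ICU Any-Eng transliteration and the alias fixing, so it only matches the real output for plain Latin input.

import java.text.Normalizer;

public class NormalizeSketch {

	static String normalize(String s) {
		return Normalizer.normalize(s, Normalizer.Form.NFD)
			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
			.toLowerCase()
			.replaceAll("[^ \\w]+", "")
			.replaceAll("\\s+", " ")
			.trim();
	}

	public static void main(String[] args) {
		System.out.println(normalize("Università di Bologna")); // prints: universita di bologna
	}
}
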

View File

@ -154,13 +154,5 @@
"unknown":{ "unknown":{
"original":"Unknown", "original":"Unknown",
"inverse":"Unknown" "inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
} }
} }

View File

@ -65,13 +65,12 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val master = parser.get("master") val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master") log.info(s"Creating Spark session: Master: $master")
val b = SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
if (master != null) .master(master)
b.master(master) .getOrCreate()
b.getOrCreate()
} }
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
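
The change above makes the .master(...) call conditional, so a master configured via spark-submit is no longer overridden. A hedged Java analogue of the same pattern (the class and method below are illustrative, not part of the patch):

import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {

	static SparkSession create(String master) {
		SparkSession.Builder b = SparkSession
			.builder()
			.appName(SparkSessionSketch.class.getSimpleName());
		if (master != null) {
			b.master(master); // only pin the master when one is explicitly supplied
		}
		return b.getOrCreate();
	}
}
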

View File

@ -65,11 +65,7 @@ object ScholixUtils extends Serializable {
} }
def generateScholixResourceFromResult(r: Result): ScholixResource = { def generateScholixResourceFromResult(r: Result): ScholixResource = {
val sum = ScholixUtils.resultToSummary(r)
if (sum != null)
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
else
null
} }
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -157,14 +153,6 @@ object ScholixUtils extends Serializable {
} }
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -389,7 +377,10 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
// s.setTypology(r.getResulttype.getClassid) if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.common.api;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMretadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newDeposition());
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMretadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
@Test
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
}
}
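
For orientation, a hedged sketch of the deposition flow these tests exercise, using only the ZenodoAPIClient methods visible above; the URL, token, file name and metadata payload are placeholders.

import java.io.FileInputStream;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoFlowSketch {

	public static void main(String[] args) throws Exception {
		ZenodoAPIClient client = new ZenodoAPIClient(
			"https://sandbox.zenodo.org/api/deposit/depositions", "<ACCESS_TOKEN>");

		System.out.println(client.newDeposition()); // 201 expected
		try (InputStream is = new FileInputStream("COVID-19.json.gz")) {
			System.out.println(client.uploadIS(is, "COVID-19.json.gz")); // 200 expected
		}
		System.out.println(client.sendMretadata("{\"metadata\": {}}")); // 200 expected
		System.out.println(client.publish()); // 202 expected
	}
}
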

View File

@ -29,7 +29,7 @@ class IdentifierFactoryTest {
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); "publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier( verifyIdentifier(
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true); "publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier( verifyIdentifier(
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true); "publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true); "publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
verifyIdentifier( verifyIdentifier(
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true); "publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier( verifyIdentifier(
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); "publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);

View File

@ -63,7 +63,7 @@ public class MergeUtilsTest {
assertEquals(1, d1.getCollectedfrom().size()); assertEquals(1, d1.getCollectedfrom().size());
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true); final Result p1d2 = MergeUtils.checkedMerge(p1, d2);
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid()); assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
assertTrue(p1d2 instanceof Publication); assertTrue(p1d2 instanceof Publication);
assertEquals(p1.getId(), p1d2.getId()); assertEquals(p1.getId(), p1d2.getId());
@ -74,7 +74,7 @@ public class MergeUtilsTest {
Publication p2 = read("publication_2.json", Publication.class); Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class); Dataset d1 = read("dataset_1.json", Dataset.class);
final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true); final Result p2d1 = MergeUtils.checkedMerge(p2, d1);
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid()); assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
assertTrue(p2d1 instanceof Dataset); assertTrue(p2d1 instanceof Dataset);
assertEquals(d1.getId(), p2d1.getId()); assertEquals(d1.getId(), p2d1.getId());
@ -86,7 +86,7 @@ public class MergeUtilsTest {
Publication p1 = read("publication_1.json", Publication.class); Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class); Publication p2 = read("publication_2.json", Publication.class);
Result p1p2 = MergeUtils.checkedMerge(p1, p2, true); Result p1p2 = MergeUtils.checkedMerge(p1, p2);
assertTrue(p1p2 instanceof Publication); assertTrue(p1p2 instanceof Publication);
assertEquals(p1.getId(), p1p2.getId()); assertEquals(p1.getId(), p1p2.getId());
assertEquals(2, p1p2.getCollectedfrom().size()); assertEquals(2, p1p2.getCollectedfrom().size());

View File

@ -177,7 +177,7 @@ class OafMapperUtilsTest {
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals( assertEquals(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, ModelConstants.DATASET_RESULTTYPE_CLASSID,
((Result) MergeUtils ((Result) MergeUtils
.merge(p2, d1)) .merge(p2, d1))
.getResulttype() .getResulttype()

View File

@ -29,7 +29,7 @@
}, },
{ {
"qualifier": {"classid": "pmc"}, "qualifier": {"classid": "pmc"},
"value": "PMC21459329" "value": "21459329"
} }
] ]
} }

View File

@ -13,7 +13,7 @@
}, },
{ {
"qualifier":{"classid":"pmc"}, "qualifier":{"classid":"pmc"},
"value":"PMC21459329" "value":"21459329"
} }
] ]
} }
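
Both fixture changes above move the expected pmc value to the "PMC<digits>" form. A hedged sketch of such a normalization; the actual rule lives in the pid cleaning utilities, so the prefixing below is an assumption.

public class PmcNormalizeSketch {

	static String normalizePmc(String value) {
		String v = value.trim().toUpperCase();
		return v.startsWith("PMC") ? v : "PMC" + v; // assumed: add the prefix when missing
	}

	public static void main(String[] args) {
		System.out.println(normalizePmc("21459329")); // prints: PMC21459329
	}
}
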

View File

@ -24,7 +24,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>process-resources</phase> <phase>initialize</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -49,16 +49,18 @@
</build> </build>
<dependencies> <dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
@ -83,6 +85,10 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
@ -101,90 +107,4 @@
</dependency> </dependency>
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -2,41 +2,31 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("legalnameclustering") @ClusteringClass("keywordsclustering")
public class LegalnameClustering extends AbstractClusteringFunction { public class KeywordsClustering extends AbstractClusteringFunction {
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+"); public KeywordsClustering(Map<String, Object> params) {
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
public LegalnameClustering(Map<String, Object> params) {
super(params); super(params);
} }
public Set<String> getRegexList(String input, Pattern codeRegex) {
Matcher matcher = codeRegex.matcher(input);
Set<String> cities = new HashSet<>();
while (matcher.find()) {
cities.add(matcher.group());
}
return cities;
}
@Override @Override
protected Collection<String> doApply(final Config conf, String s) { protected Collection<String> doApply(final Config conf, String s) {
// takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
// list of combination to return as result // list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) { for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
for (String city : getRegexList(s, CITY_CODE_PATTERN)) { for (String city : citiesToCodes(cities)) {
combinations.add(keyword + "-" + city); combinations.add(keyword + "-" + city);
if (combinations.size() >= paramOrDefault("max", 2)) { if (combinations.size() >= paramOrDefault("max", 2)) {
return combinations; return combinations;
@ -52,6 +42,9 @@ public class LegalnameClustering extends AbstractClusteringFunction {
return fields return fields
.stream() .stream()
.filter(f -> !f.isEmpty()) .filter(f -> !f.isEmpty())
.map(KeywordsClustering::cleanup)
.map(KeywordsClustering::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s)) .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream()) .flatMap(c -> c.stream())
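
A self-contained sketch of the LegalnameClustering combination logic shown above: pull the "key::N" and "city::N" codes out of an already-encoded legalname and pair them up to a configurable cap.

import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LegalnameClusteringSketch {

	static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
	static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");

	static Set<String> getRegexList(String input, Pattern codeRegex) {
		Set<String> codes = new LinkedHashSet<>(); // LinkedHashSet only to keep the printed order stable
		Matcher matcher = codeRegex.matcher(input);
		while (matcher.find()) {
			codes.add(matcher.group());
		}
		return codes;
	}

	static Collection<String> combinations(String s, int max) {
		Collection<String> combinations = new LinkedHashSet<>();
		for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
			for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
				combinations.add(keyword + "-" + city);
				if (combinations.size() >= max) {
					return combinations;
				}
			}
		}
		return combinations;
	}

	public static void main(String[] args) {
		// prints: [key::1-city::1, key::1-city::2]
		System.out.println(combinations("key::1 key::2 city::1 city::2", 2));
	}
}
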

View File

@ -4,6 +4,7 @@ package eu.dnetlib.pace.common;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -13,28 +14,24 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *
* @author claudio * @author claudio
*/ */
public class AbstractPaceFunctions extends PaceCommonUtils { public class AbstractPaceFunctions {
// city map to be used when translating the city names into codes // city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// keywords map to be used when translating the keyword names into codes
private static Map<String, String> keywordMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
// country map to be used when inferring the country from the city name
private static Map<String, String> countryMap = AbstractPaceFunctions
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
// list of stopwords in different languages // list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -44,6 +41,9 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
// blacklist of ngrams: to avoid generic keys // blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
@ -51,6 +51,8 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>"); public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// doi prefix for normalization // doi prefix for normalization
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"); public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
@ -82,64 +84,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return s12; return s12;
} }
public static String countryInference(final String original, String inferFrom) {
if (!original.equalsIgnoreCase("unknown"))
return original;
inferFrom = cleanup(inferFrom);
inferFrom = normalize(inferFrom);
inferFrom = filterAllStopWords(inferFrom);
Set<String> cities = getCities(inferFrom, 4);
return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
}
public static String cityInference(String original) {
original = cleanup(original);
original = normalize(original);
original = filterAllStopWords(original);
Set<String> cities = getCities(original, 4);
for (String city : cities) {
original = original.replaceAll(city, cityMap.get(city));
}
return original;
}
public static String keywordInference(String original) {
original = cleanup(original);
original = normalize(original);
original = filterAllStopWords(original);
Set<String> keywords = getKeywords(original, keywordMap, 4);
for (String keyword : keywords) {
original = original.replaceAll(keyword, keywordMap.get(keyword));
}
return original;
}
public static String cityKeywordInference(String original) {
original = cleanup(original);
original = normalize(original);
original = filterAllStopWords(original);
Set<String> keywords = getKeywords(original, keywordMap, 4);
Set<String> cities = getCities(original, 4);
for (String keyword : keywords) {
original = original.replaceAll(keyword, keywordMap.get(keyword));
}
for (String city : cities) {
original = original.replaceAll(city, cityMap.get(city));
}
return original;
}
protected static String fixXML(final String a) { protected static String fixXML(final String a) {
return a return a
@ -185,6 +129,25 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return numberPattern.matcher(strNum).matches(); return numberPattern.matcher(strNum).matches();
} }
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
protected static String removeSymbols(final String s) { protected static String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
@ -199,6 +162,23 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return s != null; return s != null;
} }
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String utf8(final String s) { public static String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8); byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8); return new String(bytes, StandardCharsets.UTF_8);
@ -253,6 +233,22 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return newset; return newset;
} }
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
public static Map<String, String> loadMapFromClasspath(final String classpath) { public static Map<String, String> loadMapFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
@ -274,30 +270,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return m; return m;
} }
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils
.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
// string is like this: country_code;city1;city2;city3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
String code = cityMap.get(city);
m.put(code, value);
}
}
} catch (final Throwable e) {
return new HashMap<>();
}
return m;
}
public static String removeKeywords(String s, Set<String> keywords) { public static String removeKeywords(String s, Set<String> keywords) {
s = " " + s + " "; s = " " + s + " ";
@ -327,14 +299,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return toCodes(keywords, cityMap); return toCodes(keywords, cityMap);
} }
public static Set<String> citiesToCountry(Set<String> cities) {
return toCodes(toCodes(cities, cityMap), countryMap);
}
protected static String firstLC(final String s) { protected static String firstLC(final String s) {
return StringUtils.substring(s, 0, 1).toLowerCase(); return StringUtils.substring(s, 0, 1).toLowerCase();
} }
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
public static String normalizePid(String pid) { public static String normalizePid(String pid) {
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
} }
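
A hedged usage sketch of the inference helpers in this hunk; it assumes dhp-pace-core with its bundled city/keyword/country vocabularies on the classpath, and the values in the comments are taken from the PaceFunctionTest expectations further down in this diff.

import eu.dnetlib.pace.common.AbstractPaceFunctions;

public class InferenceSketch {

	public static void main(String[] args) {
		// country is inferred from the legalname only when the original value is UNKNOWN
		System.out.println(AbstractPaceFunctions.countryInference("UNKNOWN", "Università di Bologna")); // IT
		// recognised cities and keywords are rewritten into "city::<id>" / "key::<id>" codes
		System.out.println(AbstractPaceFunctions.cityInference("Università di Bologna")); // universita city::3181928
		System.out.println(AbstractPaceFunctions.keywordInference("Polytechnic University of Turin")); // key::41 turin
	}
}
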

View File

@ -47,21 +47,9 @@ public class FieldDef implements Serializable {
private String clean; private String clean;
private String infer;
private String inferenceFrom;
public FieldDef() { public FieldDef() {
} }
public String getInferenceFrom() {
return inferenceFrom;
}
public void setInferenceFrom(final String inferenceFrom) {
this.inferenceFrom = inferenceFrom;
}
public String getName() { public String getName() {
return name; return name;
} }
@ -138,14 +126,6 @@ public class FieldDef implements Serializable {
this.clean = clean; this.clean = clean;
} }
public String getInfer() {
return infer;
}
public void setInfer(String infer) {
this.infer = infer;
}
@Override @Override
public String toString() { public String toString() {
try { try {

View File

@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import eu.dnetlib.pace.common.PaceCommonUtils; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations; import eu.dnetlib.pace.util.DotAbbreviations;
@ -86,7 +86,7 @@ public class Person {
private List<String> splitTerms(final String s) { private List<String> splitTerms(final String s) {
if (particles == null) { if (particles == null) {
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
} }
final List<String> list = Lists.newArrayList(); final List<String> list = Lists.newArrayList();

View File

@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath} import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} import eu.dnetlib.pace.util.MapDocumentUtil
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) df.map(r => rowFromJson(r))(RowEncoder(schema))
} }
def rowFromJson(json: String): Row = { def rowFromJson(json: String): Row = {
@ -123,19 +123,9 @@ case class SparkModel(conf: DedupConfig) {
case _ => res(index) case _ => res(index)
} }
} }
if (StringUtils.isNotBlank(fdef.getInfer)) {
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
res(index) = res(index) match {
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
}
}
} }
res res
} }
new GenericRowWithSchema(values, schema) new GenericRowWithSchema(values, schema)
@ -156,17 +146,5 @@ case class SparkModel(conf: DedupConfig) {
res res
} }
def inference(value: String, inferfrom: String, infertype: String) : String = {
val res = infertype match {
case "country" => AbstractPaceFunctions.countryInference(value, inferfrom)
case "city" => AbstractPaceFunctions.cityInference(value)
case "keyword" => AbstractPaceFunctions.keywordInference(value)
case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
case _ => value
}
res
}
} }

View File

@ -0,0 +1,48 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {
private Map<String, String> params;
public CityMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = citiesToCodes(cities1);
Set<String> codes2 = citiesToCodes(cities2);
// if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; // undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -1,51 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("codeMatch")
public class CodeMatch extends AbstractStringComparator {
private Map<String, String> params;
private Pattern CODE_REGEX;
public CodeMatch(Map<String, String> params) {
super(params);
this.params = params;
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
}
public Set<String> getRegexList(String input) {
Matcher matcher = this.CODE_REGEX.matcher(input);
Set<String> cities = new HashSet<>();
while (matcher.find()) {
cities.add(matcher.group());
}
return cities;
}
@Override
public double distance(final String a, final String b, final Config conf) {
Set<String> codes1 = getRegexList(a);
Set<String> codes2 = getRegexList(b);
// if no codes are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; // undefined if one of the two has no codes
return commonElementsPercentage(codes1, codes2);
}
}
}
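
A self-contained sketch of the CodeMatch decision rule above, with a stand-in for commonElementsPercentage; the |intersection| / |larger set| definition used here is an assumption, chosen because it reproduces the codeMatch expectations in ComparatorTest further down.

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CodeMatchSketch {

	static final Pattern CODE_REGEX = Pattern.compile("key::\\d+");

	static Set<String> getRegexList(String input) {
		Set<String> codes = new HashSet<>();
		Matcher matcher = CODE_REGEX.matcher(input);
		while (matcher.find()) {
			codes.add(matcher.group());
		}
		return codes;
	}

	// assumed semantics of commonElementsPercentage
	static double commonElementsPercentage(Set<String> s1, Set<String> s2) {
		Set<String> common = new HashSet<>(s1);
		common.retainAll(s2);
		return (double) common.size() / Math.max(s1.size(), s2.size());
	}

	static double distance(String a, String b) {
		Set<String> codes1 = getRegexList(a);
		Set<String> codes2 = getRegexList(b);
		if (codes1.isEmpty() && codes2.isEmpty())
			return 1.0; // no codes on either side: the comparator is not discriminating
		if (codes1.isEmpty() ^ codes2.isEmpty())
			return -1.0; // undefined if only one of the two has codes
		return commonElementsPercentage(codes1, codes2);
	}

	public static void main(String[] args) {
		System.out.println(distance("key::1 key::2 testing1", "key::1 testing")); // prints: 0.5
	}
}
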

View File

@ -1,54 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("countryMatch")
public class CountryMatch extends AbstractStringComparator {
private Map<String, String> params;
public CountryMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public CountryMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1.0; // return -1 if a field is missing
}
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
return -1.0; // return -1 if a country is UNKNOWN
}
return a.equals(b) ? 1.0 : 0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -1,59 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerLegalname")
public class JaroWinklerLegalname extends AbstractStringComparator {
private Map<String, String> params;
private final String CITY_CODE_REGEX = "city::\\d+";
private final String KEYWORD_CODE_REGEX = "key::\\d+";
public JaroWinklerLegalname(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerLegalname(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = a.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
String cb = b.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {
private Map<String, String> params;
public JaroWinklerNormalizedName(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {
Map<String, String> params;
public KeywordMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
// if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1.0; // undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
public String apply(final String s) { public String apply(final String s) {
return WordUtils.capitalize(s.toLowerCase(), DELIM); return WordUtils.capitalize(s.toLowerCase(), DELIM);
} }
} };

File diff suppressed because it is too large

View File

@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
public String apply(String s) { public String apply(String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
} };

File diff suppressed because one or more lines are too long

View File

@ -1,12 +0,0 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
RowEncoder(schema)
}
}

View File

@ -1,12 +0,0 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
ExpressionEncoder(schema)
}
}

View File

@ -8,7 +8,6 @@ import org.junit.jupiter.api.Test;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.mongodb.connection.Cluster;
import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
@ -178,16 +177,41 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void legalnameClustering() { public void testKeywordsClustering() {
final ClusteringFunction cf = new LegalnameClustering(params); final ClusteringFunction cf = new KeywordsClustering(params);
String s = "key::1 key::2 city::1"; final String s = "Polytechnic University of Turin";
System.out.println(s); System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
s = "key::1 key::2 city::1 city::2"; final String s1 = "POLITECNICO DI TORINO";
System.out.println(s); System.out.println(s1);
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
} }
@Test @Test

View File

@ -54,47 +54,4 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
} }
@Test
public void countryInferenceTest() {
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
assertEquals("UK", countryInference("UK", "Università di Bologna"));
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
}
@Test
public void cityInferenceTest() {
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
assertEquals("university city::3170647", cityInference("University of Pisa"));
assertEquals("universita", cityInference("Università del lavoro"));
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
}
@Test
public void keywordInferenceTest() {
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 milano bergamo",
keywordInference("Universita farmaceutica culturale di milano bergamo"));
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
assertEquals(
"key::10 kapodistriako panepistemio athenon",
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
@Test
public void cityKeywordInferenceTest() {
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 city::3173435 city::3182164",
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
assertEquals(
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
assertEquals(
"key::10 kapodistriako panepistemio city::264371",
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
} }

View File

@ -35,7 +35,6 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95"); params.put("name_th", "0.95");
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
params.put("codeRegex", "key::\\d+");
} }
@Test @Test
@ -45,23 +44,52 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void codeMatchTest() { public void cityMatchTest() {
CodeMatch codeMatch = new CodeMatch(params); final CityMatch cityMatch = new CityMatch(params);
// both names with no codes // both names with no cities
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf)); assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
// one of the two names with no codes // one of the two names with no cities
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf)); assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
// both names with codes (same) // both names with cities (same)
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf)); assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
// both names with codes (different) // both names with cities (different)
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf)); assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
// both names with codes (1 same, 1 different) // particular cases
assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf)); assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
}
@Test
public void keywordMatchTest() {
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
@ -127,15 +155,15 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void jaroWinklerLegalnameTest() { public void jaroWinklerNormalizedNameTest() {
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerLegalname double result = jaroWinklerNormalizedName
.distance("AT&T (United States)", "United States key::2 key::1", conf); .distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -308,23 +336,4 @@ public class ComparatorTest extends AbstractPaceTest {
System.out.println("compare = " + compare); System.out.println("compare = " + compare);
} }
@Test
public void countryMatch() {
CountryMatch countryMatch = new CountryMatch(params);
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
assertEquals(-1.0, result);
result = countryMatch.distance("CL", "UNKNOWN", conf);
assertEquals(-1.0, result);
result = countryMatch.distance("CL", "IT", conf);
assertEquals(0.0, result);
result = countryMatch.distance("CL", "CL", conf);
assertEquals(1.0, result);
}
} }

View File

@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest { public class UtilTest {

View File

@ -1,113 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dhp</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-shade-package</artifactId>
<description>This module creates a jar of all module dependencies</description>
<build>
<plugins>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer>
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<transformer />
<transformer>
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.6.1</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>junit-jupiter-api</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
<exclusion>
<artifactId>junit-jupiter-params</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
<exclusion>
<artifactId>junit-jupiter-engine</artifactId>
<groupId>org.junit.jupiter</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>byte-buddy</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>byte-buddy-agent</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
</dependencies>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
</project>

View File

@ -1,169 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module creates a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
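The ManifestResourceTransformer configured above is what writes the chosen main class into the shaded jar's manifest. A minimal sketch to verify that entry after a build (the jar path is an assumption derived from the module coordinates and version, not taken from the diff):

import java.io.IOException;
import java.util.jar.JarFile;

public class ShadedJarManifestCheck {
    public static void main(String[] args) throws IOException {
        // hypothetical location of the shaded artifact produced by `mvn package` on this module
        String path = "dhp-shade-package/target/dhp-shade-package-1.2.5-SNAPSHOT.jar";
        try (JarFile jar = new JarFile(path)) {
            // the ManifestResourceTransformer is what puts this entry into META-INF/MANIFEST.MF
            System.out.println(jar.getManifest().getMainAttributes().getValue("Main-Class"));
            // expected, per the configuration above: eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
        }
    }
}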

View File

@ -51,5 +51,48 @@
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.util.List; import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -21,6 +22,7 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -63,7 +65,7 @@ public class ISClient implements Serializable {
.map(t -> buildDirectory(basePath, t)) .map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list")); .orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ISLookUpException e) { } catch (ActionManagerException | ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS"); throw new IllegalStateException("unable to query ActionSets info from the IS");
} }
} }
@ -87,18 +89,31 @@ public class ISClient implements Serializable {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight()); return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
} }
private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException { private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
return queryServiceProperty(isLookup, "basePath"); return queryServiceProperty(isLookup, "basePath");
} }
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
throws ISLookUpException { throws ActionManagerException {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName + propertyName
+ "']/@value/string()"; + "']/@value/string()";
log.debug("quering for service property: {}", q); log.debug("quering for service property: {}", q);
try {
final List<String> value = isLookup.quickSearchProfile(q); final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value); return Iterables.getOnlyElement(value);
} catch (ISLookUpException e) {
String msg = "Error accessing service profile, using query: " + q;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (NoSuchElementException e) {
String msg = "missing service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (IllegalArgumentException e) {
String msg = "found more than one service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
}
} }
} }
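The two extra catch blocks introduced above map directly onto the failure modes of Guava's Iterables.getOnlyElement, which this lookup relies on: it throws NoSuchElementException when the query returns nothing and IllegalArgumentException when it returns more than one value. A minimal standalone sketch of that behaviour (sample values are hypothetical):

import java.util.Arrays;
import java.util.Collections;
import java.util.NoSuchElementException;

import com.google.common.collect.Iterables;

public class GetOnlyElementSketch {
    public static void main(String[] args) {
        // exactly one result: the property value is returned as-is
        System.out.println(Iterables.getOnlyElement(Arrays.asList("/var/lib/dnet/actionManager")));

        try {
            Iterables.getOnlyElement(Collections.emptyList()); // no profile property found
        } catch (NoSuchElementException e) {
            System.out.println("empty result -> \"missing service property\" branch");
        }

        try {
            Iterables.getOnlyElement(Arrays.asList("a", "b")); // duplicated property
        } catch (IllegalArgumentException e) {
            System.out.println("multiple results -> \"found more than one service property\" branch");
        }
    }
}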

View File

@ -103,7 +103,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -157,7 +156,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -95,7 +95,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -125,7 +125,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -135,10 +134,21 @@
<arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg> <arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark> </spark>
<ok to="PromoteActionPayloadForDatasetTable"/> <ok to="ForkPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="ForkPromote">
<path start="PromoteActionPayloadForDatasetTable"/>
<path start="PromoteActionPayloadForDatasourceTable"/>
<path start="PromoteActionPayloadForOrganizationTable"/>
<path start="PromoteActionPayloadForOtherResearchProductTable"/>
<path start="PromoteActionPayloadForProjectTable"/>
<path start="PromoteActionPayloadForPublicationTable"/>
<path start="PromoteActionPayloadForRelationTable"/>
<path start="PromoteActionPayloadForSoftwareTable"/>
</fork>
<action name="PromoteActionPayloadForDatasetTable"> <action name="PromoteActionPayloadForDatasetTable">
<sub-workflow> <sub-workflow>
<app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path> <app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path>
@ -150,7 +160,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForDatasourceTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -165,7 +175,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForOrganizationTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -180,7 +190,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForOtherResearchProductTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -195,7 +205,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForProjectTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -210,7 +220,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForPublicationTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -225,7 +235,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForRelationTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -240,7 +250,7 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="PromoteActionPayloadForSoftwareTable"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -255,9 +265,11 @@
</property> </property>
</configuration> </configuration>
</sub-workflow> </sub-workflow>
<ok to="End"/> <ok to="JoinPromote"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="JoinPromote" to="End"/>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -95,7 +95,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -103,7 +103,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -156,12 +155,11 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=8000 --conf spark.sql.shuffle.partitions=2560
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>

View File

@ -95,7 +95,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -103,12 +103,11 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000 --conf spark.sql.shuffle.partitions=7000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -157,12 +156,11 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000 --conf spark.sql.shuffle.partitions=7000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@ -95,12 +95,11 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000 --conf spark.sql.shuffle.partitions=10000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

View File

@ -103,7 +103,6 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -156,12 +155,11 @@
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=4000 --conf spark.sql.shuffle.partitions=2560
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg> <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>

View File

@ -1,210 +0,0 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.actionmanager.promote;
import static eu.dnetlib.dhp.common.FunctionalInterfaceSupport.*;
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
import static org.apache.spark.sql.functions.*;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class PromoteResultWithMeasuresTest {
private static final Logger log = LoggerFactory.getLogger(PromoteResultWithMeasuresTest.class);
private static SparkSession spark;
private static Path tempDir;
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@BeforeAll
public static void beforeAll() throws IOException {
tempDir = Files.createTempDirectory(PromoteResultWithMeasuresTest.class.getSimpleName());
log.info("using work dir {}", tempDir);
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName(PromoteResultWithMeasuresTest.class.getSimpleName());
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", tempDir.toString());
conf.set("hive.metastore.warehouse.dir", tempDir.resolve("warehouse").toString());
spark = SparkSession.builder().config(conf).getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
spark.stop();
FileUtils.deleteDirectory(tempDir.toFile());
}
@Test
void testPromoteResultWithMeasures_job() throws Exception {
final String inputGraphTablePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/graph")
.getPath();
final String inputActionPayloadPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads")
.getPath();
final String actionPayloadsPath = tempDir.resolve("actionPayloads").toString();
spark
.read()
.text(inputActionPayloadPath)
.withColumn("payload", col("value"))
.select("payload")
.write()
.parquet(actionPayloadsPath);
final Path outputGraphTablePath = tempDir.resolve("outputGraphTablePath");
PromoteActionPayloadForGraphTableJob
.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--graphTableClassName", Publication.class.getCanonicalName(),
"--inputGraphTablePath", inputGraphTablePath,
"--inputActionPayloadPath", actionPayloadsPath,
"--actionPayloadClassName", Result.class.getCanonicalName(),
"--outputGraphTablePath", outputGraphTablePath.toString(),
"--mergeAndGetStrategy", MergeAndGet.Strategy.MERGE_FROM_AND_GET.toString(),
"--promoteActionStrategy", PromoteAction.Strategy.ENRICH.toString(),
"--shouldGroupById", "true"
});
assertFalse(isDirEmpty(outputGraphTablePath));
final Encoder<Publication> pubEncoder = Encoders.bean(Publication.class);
List<Publication> results = spark
.read()
.schema(pubEncoder.schema())
.json(outputGraphTablePath.toString())
.as(pubEncoder)
.collectAsList();
verify(results);
}
@Test
void testPromoteResultWithMeasures_internal() throws JsonProcessingException {
Dataset<Publication> rowDS = spark
.read()
.schema(Encoders.bean(Publication.class).schema())
.json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph")
.as(Encoders.bean(Publication.class));
Dataset<Result> actionPayloadDS = spark
.read()
.schema(Encoders.bean(Result.class).schema())
.json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads")
.as(Encoders.bean(Result.class));
final MergeAndGet.Strategy mergeFromAndGet = MergeAndGet.Strategy.MERGE_FROM_AND_GET;
final SerializableSupplier<Function<Publication, String>> rowIdFn = ModelSupport::idFn;
final SerializableSupplier<BiFunction<Publication, Result, Publication>> mergeAndGetFn = MergeAndGet
.functionFor(mergeFromAndGet);
final SerializableSupplier<Publication> zeroFn = () -> Publication.class
.cast(new eu.dnetlib.dhp.schema.oaf.Publication());
final SerializableSupplier<Function<Publication, Boolean>> isNotZeroFn = PromoteResultWithMeasuresTest::isNotZeroFnUsingIdOrSourceAndTarget;
Dataset<Publication> joinedResults = PromoteActionPayloadFunctions
.joinGraphTableWithActionPayloadAndMerge(
rowDS,
actionPayloadDS,
rowIdFn,
ModelSupport::idFn,
mergeAndGetFn,
PromoteAction.Strategy.ENRICH,
Publication.class,
Result.class);
SerializableSupplier<BiFunction<Publication, Publication, Publication>> mergeRowsAndGetFn = MergeAndGet
.functionFor(mergeFromAndGet);
Dataset<Publication> mergedResults = PromoteActionPayloadFunctions
.groupGraphTableByIdAndMerge(
joinedResults, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, Publication.class);
verify(mergedResults.collectAsList());
}
private static void verify(List<Publication> results) throws JsonProcessingException {
assertNotNull(results);
assertEquals(1, results.size());
Result r = results.get(0);
log.info(OBJECT_MAPPER.writeValueAsString(r));
assertNotNull(r.getMeasures());
assertFalse(r.getMeasures().isEmpty());
assertTrue(
r
.getMeasures()
.stream()
.map(Measure::getId)
.collect(Collectors.toCollection(HashSet::new))
.containsAll(
Lists
.newArrayList(
"downloads", "views", "influence", "popularity", "influence_alt", "popularity_alt",
"impulse")));
}
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
return t -> {
if (isSubClass(t, Relation.class)) {
final Relation rel = (Relation) t;
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
}
return StringUtils.isNotBlank(((OafEntity) t).getId());
};
}
private static boolean isDirEmpty(final Path directory) throws IOException {
try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(directory)) {
return !dirStream.iterator().hasNext();
}
}
}

View File

@ -1,3 +0,0 @@
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"downloads","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"125","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"views","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"35","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"3.1167566E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"7.335433E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"influence_alt","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"2.96","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"impulse","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null}
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":"hybrid","publiclyFunded":false,"transformativeAgreement":null,"isGreen":true,"isInDiamondJournal":false}

View File

@ -42,9 +42,6 @@ public class Constants {
public static final String NULL = "NULL"; public static final String NULL = "NULL";
public static final String NA = "N/A"; public static final String NA = "N/A";
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
public static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() { private Constants() {

View File

@ -9,7 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
@ -28,7 +28,6 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
@ -41,13 +40,9 @@ public class PrepareAffiliationRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class); private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________::"; private static final String ID_PREFIX = "50|doi_________::";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference"; public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE"; public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
public static final String DOI_URL_PREFIX = "https://doi.org/";
public static final int DOI_URL_PREFIX_LENGTH = 16;
public static <I extends Result> void main(String[] args) throws Exception { public static <I extends Result> void main(String[] args) throws Exception {
@ -75,12 +70,6 @@ public class PrepareAffiliationRelations implements Serializable {
final String dataciteInputPath = parser.get("dataciteInputPath"); final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath); log.info("dataciteInputPath: {}", dataciteInputPath);
final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String publisherInputPath = parser.get("publisherInputPath");
log.info("publisherInputPath: {}", publisherInputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
@ -91,72 +80,35 @@ public class PrepareAffiliationRelations implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Constants.removeOutputDir(spark, outputPath); Constants.removeOutputDir(spark, outputPath);
createActionSet(
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
publisherInputPath, outputPath);
});
}
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath, List<KeyValue> collectedFromCrossref = OafMapperUtils
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath, .listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
String outputPath) { JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils spark, crossrefInputPath, collectedFromCrossref);
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
spark, crossrefInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromPubmed = OafMapperUtils
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
spark, pubmedInputPath, collectedfromOpenAIRE); spark, pubmedInputPath, collectedFromPubmed);
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel( List<KeyValue> collectedFromOpenAPC = OafMapperUtils
spark, openapcInputPath, collectedfromOpenAIRE); .listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedfromOpenAIRE); spark, dataciteInputPath, collectedFromDatacite);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedfromOpenAIRE);
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
spark, publisherlInputPath, collectedfromOpenAIRE);
crossrefRelations crossrefRelations
.union(pubmedRelations) .union(pubmedRelations)
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations) .union(dataciteRelations)
.union(webCrawlRelations)
.union(publisherRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
String inputPath,
List<KeyValue> collectedfrom) {
Dataset<Row> df = spark
.read()
.schema(
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
List<KeyValue> collectedfrom) {
Dataset<Row> df = spark
.read()
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
});
} }
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark, private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
@ -167,27 +119,8 @@ public class PrepareAffiliationRelations implements Serializable {
Dataset<Row> df = spark Dataset<Row> df = spark
.read() .read()
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>") .schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
.json(inputPath) .json(inputPath);
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df);
}
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
String inputPath,
List<KeyValue> collectedfrom) {
// load and parse affiliation relations from HDFS
Dataset<Row> df = spark
.read()
.schema(
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDDNew(collectedfrom, df);
}
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays // unroll nested arrays
df = df df = df
.withColumn("matching", functions.explode(new Column("Matchings"))) .withColumn("matching", functions.explode(new Column("Matchings")))
@ -203,7 +136,7 @@ public class PrepareAffiliationRelations implements Serializable {
// DOI to OpenAIRE id // DOI to OpenAIRE id
final String paperId = ID_PREFIX final String paperId = ID_PREFIX
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi")))); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
// ROR id to OpenAIRE id // ROR id to OpenAIRE id
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid")); final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
@ -235,69 +168,6 @@ public class PrepareAffiliationRelations implements Serializable {
new Text(OBJECT_MAPPER.writeValueAsString(aa)))); new Text(OBJECT_MAPPER.writeValueAsString(aa))));
} }
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays
df = df
.withColumn("matching", functions.explode(new Column("Matchings")))
.select(
new Column("DOI").as("doi"),
new Column("matching.PID").as("pidtype"),
new Column("matching.Value").as("pidvalue"),
new Column("matching.Confidence").as("confidence"),
new Column("matching.Status").as("status"))
.where("status = 'active'");
// prepare action sets for affiliation relations
return df
.toJavaRDD()
.flatMap((FlatMapFunction<Row, Relation>) row -> {
// DOI to OpenAIRE id
final String paperId = ID_PREFIX
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
// Organization to OpenAIRE identifier
String affId = null;
if (row.getAs("pidtype").equals("ROR"))
// ROR id to OpenAIRE id
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
else
// getting the OpenOrgs identifier for the organization
affId = row.getAs("pidvalue");
Qualifier qualifier = OafMapperUtils
.qualifier(
BIP_AFFILIATIONS_CLASSID,
BIP_AFFILIATIONS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS);
// format data info; setting `confidence` into relation's `trust`
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
BIP_INFERENCE_PROVENANCE,
true,
false,
qualifier,
Double.toString(row.getAs("confidence")));
// return bi-directional relations
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
})
.map(p -> new AtomicAction(Relation.class, p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
}
private static String removePrefix(String doi) {
if (doi.startsWith(DOI_URL_PREFIX))
return doi.substring(DOI_URL_PREFIX_LENGTH);
return doi;
}
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom, private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
DataInfo dataInfo) { DataInfo dataInfo) {
return Arrays return Arrays

View File

@ -10,7 +10,6 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -84,7 +83,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD resultsRDD
.union(projectsRDD) .union(projectsRDD)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}); });
} }
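The only functional change in this hunk is which compression codec class is passed to saveAsHadoopFile; bzip2 output is splittable while gzip output is not, which is a common reason to prefer BZip2Codec for large action sets. A minimal local sketch of that call shape (output path and sample record are hypothetical, not taken from the job):

import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SaveActionSetSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("save-actionset-sketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // sample (class name, serialized action) pairs; the real job builds these from AtomicAction objects
        JavaPairRDD<Text, Text> actions = sc
            .parallelize(Arrays.asList("{\"payload\":\"...\"}"))
            .mapToPair(json -> new Tuple2<>(new Text("eu.dnetlib.dhp.schema.oaf.Relation"), new Text(json)));

        actions.saveAsHadoopFile(
            "/tmp/actionset-sketch",           // hypothetical output path, must not already exist
            Text.class, Text.class,
            SequenceFileOutputFormat.class,
            BZip2Codec.class);                 // GzipCodec.class would swap in here on the other branch

        sc.stop();
    }
}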

View File

@ -80,11 +80,9 @@ public class PrepareFOSSparkJob implements Serializable {
fosDataset fosDataset
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING()) .groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
.mapGroups( .mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
(MapGroupsFunction<String, FOSDataModel, Result>) (k, return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it);
it) -> getResult( }, Encoders.bean(Result.class))
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
Encoders.bean(Result.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
@ -115,7 +113,19 @@ public class PrepareFOSSparkJob implements Serializable {
.forEach( .forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true))); l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
r.setSubject(sbjs); r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r; return r;
} }

View File

@ -6,23 +6,26 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable { public class PrepareSDGSparkJob implements Serializable {
@ -49,91 +52,54 @@ public class PrepareSDGSparkJob implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final Boolean distributeDOI = Optional
.ofNullable(parser.get("distributeDoi"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("distribute doi {}", distributeDOI);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (distributeDOI)
doPrepare( doPrepare(
spark, spark,
sourcePath, sourcePath,
outputPath); outputPath);
else
doPrepareoaid(spark, sourcePath, outputPath);
}); });
} }
private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) { private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
Dataset<Row> sdgDataset = spark Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);
.read()
.format("csv")
.option("sep", DEFAULT_DELIMITER)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(sourcePath);
sdgDataset sdgDataset
.groupByKey((MapFunction<Row, String>) v -> ((String) v.getAs("doi")).toLowerCase(), Encoders.STRING()) .groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
.mapGroups( .mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
(MapGroupsFunction<String, Row, Result>) (k,
it) -> getResult(
DHPUtils
.generateUnresolvedIdentifier(
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k,
DOI),
it),
Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/sdg");
}
private static void doPrepareoaid(SparkSession spark, String sourcePath, String outputPath) {
Dataset<Row> sdgDataset = spark
.read()
.format("csv")
.option("sep", DEFAULT_DELIMITER)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(sourcePath);
;
sdgDataset
.groupByKey((MapFunction<Row, String>) r -> "50|" + ((String) r.getAs("oaid")), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Row, Result>) PrepareSDGSparkJob::getResult, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/sdg");
}
private static @NotNull Result getResult(String id, Iterator<Row> it) {
Result r = new Result(); Result r = new Result();
r.setId(id); r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
Row first = it.next(); SDGDataModel first = it.next();
List<Subject> sbjs = new ArrayList<>(); List<Subject> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getAs("sdg"), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)); sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it it
.forEachRemaining( .forEachRemaining(
s -> sbjs s -> sbjs
.add(getSubject(s.getAs("sdg"), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID))); .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs); r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r; return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/sdg");
} }
} }

View File

@ -13,6 +13,9 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -21,9 +24,13 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.*;
import scala.Tuple2; import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable { public class CreateActionSetSparkJob implements Serializable {

View File

@ -49,9 +49,6 @@ public class ReadCOCI implements Serializable {
final String workingPath = parser.get("inputPath"); final String workingPath = parser.get("inputPath");
log.info("workingPath {}", workingPath); log.info("workingPath {}", workingPath);
final String backupPath = parser.get("backupPath");
log.info("backupPath {}", backupPath);
SparkConf sconf = new SparkConf(); SparkConf sconf = new SparkConf();
Configuration conf = new Configuration(); Configuration conf = new Configuration();
@ -71,14 +68,12 @@ public class ReadCOCI implements Serializable {
workingPath,
fileSystem,
outputPath,
backupPath,
delimiter);
});
}
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
String outputPath,
String backupPath,
String delimiter) throws IOException {
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
@ -112,8 +107,7 @@ public class ReadCOCI implements Serializable {
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath);
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
}
}

View File

@ -1,80 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class CoAuthorshipIterator implements Iterator<Relation> {
private int firstIndex;
private int secondIndex;
private boolean firstRelation;
private List<String> authors;
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______::";
private static final String OPENAIRE_PREFIX = "openaire____";
private static final String SEPARATOR = "::";
private static final String ORCID_KEY = "10|" + OPENAIRE_PREFIX + SEPARATOR
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
@Override
public boolean hasNext() {
return firstIndex < authors.size() - 1;
}
@Override
public Relation next() {
Relation rel = null;
if (firstRelation) {
rel = getRelation(authors.get(firstIndex), authors.get(secondIndex));
firstRelation = Boolean.FALSE;
} else {
rel = getRelation(authors.get(secondIndex), authors.get(firstIndex));
firstRelation = Boolean.TRUE;
secondIndex += 1;
if (secondIndex >= authors.size()) {
firstIndex += 1;
secondIndex = firstIndex + 1;
}
}
return rel;
}
public CoAuthorshipIterator(List<String> authors) {
this.authors = authors;
this.firstIndex = 0;
this.secondIndex = 1;
this.firstRelation = Boolean.TRUE;
}
private Relation getRelation(String orcid1, String orcid2) {
String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
return OafMapperUtils
.getRelation(
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(ORCID_KEY, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
}
}

View File

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class Coauthors implements Serializable {
private List<String> coauthors;
public List<String> getCoauthors() {
return coauthors;
}
public void setCoauthors(List<String> coauthors) {
this.coauthors = coauthors;
}
}

View File

@ -1,40 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class Couples implements Serializable {
Person p;
Relation r;
public Couples() {
}
public Person getP() {
return p;
}
public void setP(Person p) {
this.p = p;
}
public Relation getR() {
return r;
}
public void setR(Relation r) {
this.r = r;
}
public static <Tuples> Couples newInstance(Tuple2<Person, Relation> couple) {
Couples c = new Couples();
c.p = couple._1();
c.r = couple._2();
return c;
}
}

View File

@ -1,437 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spark_project.jetty.util.StringUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.orcid.model.Author;
import eu.dnetlib.dhp.collection.orcid.model.Employment;
import eu.dnetlib.dhp.collection.orcid.model.Work;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class ExtractPerson implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String OPENAIRE_PREFIX = "openaire____";
private static final String SEPARATOR = "::";
private static final String orcidKey = "10|" + OPENAIRE_PREFIX + SEPARATOR
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
private static final String DOI_PREFIX = "50|doi_________::";
private static final String PMID_PREFIX = "50|pmid________::";
private static final String ARXIV_PREFIX = "50|arXiv_______::";
private static final String PMCID_PREFIX = "50|pmcid_______::";
private static final String ROR_PREFIX = "20|ror_________::";
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
ExtractPerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir {}", workingDir);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
createActionSet(spark, inputPath, outputPath, workingDir);
});
}
private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
Dataset<Author> authors = spark
.read()
.parquet(inputPath + "Authors")
.as(Encoders.bean(Author.class));
Dataset<Work> works = spark
.read()
.parquet(inputPath + "Works")
.as(Encoders.bean(Work.class))
.filter(
(FilterFunction<Work>) w -> Optional.ofNullable(w.getPids()).isPresent() &&
w
.getPids()
.stream()
.anyMatch(
p -> p.getSchema().equalsIgnoreCase("doi") ||
p.getSchema().equalsIgnoreCase("pmc") ||
p.getSchema().equalsIgnoreCase("pmid") ||
p.getSchema().equalsIgnoreCase("arxiv")));
Dataset<Employment> employmentDataset = spark
.read()
.parquet(inputPath + "Employments")
.as(Encoders.bean(Employment.class));
Dataset<Author> peopleToMap = authors
.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
Dataset<Employment> employment = employmentDataset
.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
Dataset<Person> people;
peopleToMap.map((MapFunction<Author, Person>) op -> {
Person person = new Person();
person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
person
.setBiography(
Optional
.ofNullable(op.getBiography())
.orElse(""));
KeyValue kv = OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS);
kv.setDataInfo(null);
person.setCollectedfrom(Arrays.asList(kv));
person
.setAlternativeNames(
Optional
.ofNullable(op.getOtherNames())
.orElse(new ArrayList<>()));
person
.setFamilyName(
Optional
.ofNullable(op.getFamilyName())
.orElse(""));
person
.setGivenName(
Optional
.ofNullable(op.getGivenName())
.orElse(""));
person
.setPid(
Optional
.ofNullable(op.getOtherPids())
.map(
v -> v
.stream()
.map(
p -> OafMapperUtils
.structuredProperty(
p.getValue(), p.getSchema(), p.getSchema(), ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES, null))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
person
.getPid()
.add(
OafMapperUtils
.structuredProperty(
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
person.setDateofcollection(op.getLastModifiedDate());
person.setOriginalId(Arrays.asList(op.getOrcid()));
return person;
}, Encoders.bean(Person.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/people");
works
.flatMap(
(FlatMapFunction<Work, Relation>) ExtractPerson::getAuthorshipRelationIterator,
Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/authorship");
Dataset<Relation> coauthorship = works
.flatMap((FlatMapFunction<Work, Tuple2<String, String>>) w -> {
List<Tuple2<String, String>> lista = new ArrayList<>();
w.getPids().stream().forEach(p -> {
if (p.getSchema().equalsIgnoreCase("doi") || p.getSchema().equalsIgnoreCase("pmc")
|| p.getSchema().equalsIgnoreCase("pmid") || p.getSchema().equalsIgnoreCase("arxiv"))
lista.add(new Tuple2<>(p.getValue(), w.getOrcid()));
});
return lista.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.groupByKey((MapFunction<Tuple2<String, String>, String>) Tuple2::_1, Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<String, String>, Coauthors>) (k, it) -> extractCoAuthors(it),
Encoders.bean(Coauthors.class))
.flatMap(
(FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
Encoders.bean(Relation.class))
.groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class));
coauthorship
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/coauthorship");
employment
.filter((FilterFunction<Employment>) e -> Optional.ofNullable(e.getAffiliationId()).isPresent())
.filter((FilterFunction<Employment>) e -> e.getAffiliationId().getSchema().equalsIgnoreCase("ror"))
.map(
(MapFunction<Employment, Relation>) ExtractPerson::getAffiliationRelation,
Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingDir + "/affiliation");
people = spark
.read()
.textFile(workingDir + "/people")
.map(
(MapFunction<String, Person>) value -> OBJECT_MAPPER
.readValue(value, Person.class),
Encoders.bean(Person.class));
people.show(false);
people
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.union(
getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
.union(
getRelations(spark, workingDir + "/coauthorship")
.toJavaRDD()
.map(r -> new AtomicAction(r.getClass(), r)))
.union(
getRelations(spark, workingDir + "/affiliation")
.toJavaRDD()
.map(r -> new AtomicAction(r.getClass(), r)))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}
private static Dataset<Relation> getRelations(SparkSession spark, String path) {
return spark
.read()
.textFile(path)
.map(
(MapFunction<String, Relation>) value -> OBJECT_MAPPER
.readValue(value, Relation.class),
Encoders.bean(Relation.class));// spark.read().json(path).as(Encoders.bean(Relation.class));
}
private static Coauthors extractCoAuthors(Iterator<Tuple2<String, String>> it) {
Coauthors coauth = new Coauthors();
List<String> coauthors = new ArrayList<>();
while (it.hasNext())
coauthors.add(it.next()._2());
coauth.setCoauthors(coauthors);
return coauth;
}
private static Relation getAffiliationRelation(Employment row) {
String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
String target = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
List<KeyValue> properties = new ArrayList<>();
Relation relation = OafMapperUtils
.getRelation(
source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
ModelConstants.ORG_PERSON_PARTICIPATES,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
KeyValue kv = new KeyValue();
kv.setKey("startDate");
kv.setValue(row.getStartDate());
properties.add(kv);
}
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
KeyValue kv = new KeyValue();
kv.setKey("endDate");
kv.setValue(row.getEndDate());
properties.add(kv);
}
if (properties.size() > 0)
relation.setProperties(properties);
return relation;
}
private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
return Arrays
.asList(
OafMapperUtils
.getRelation(
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null),
OafMapperUtils
.getRelation(
target, source, ModelConstants.PERSON_PERSON_RELTYPE,
ModelConstants.PERSON_PERSON_SUBRELTYPE,
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null));
}
private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
if (Optional.ofNullable(w.getPids()).isPresent())
return w
.getPids()
.stream()
.map(pid -> getRelation(w.getOrcid(), pid))
.filter(Objects::nonNull)
.collect(Collectors.toList())
.iterator();
List<Relation> ret = new ArrayList<>();
return ret.iterator();
}
private static Relation getRelation(String orcid, eu.dnetlib.dhp.collection.orcid.model.Pid pid) {
String target;
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
switch (pid.getSchema()) {
case "doi":
target = DOI_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), pid.getValue()));
break;
case "pmid":
target = PMID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pid.getValue()));
break;
case "arxiv":
target = ARXIV_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), pid.getValue()));
break;
case "pmcid":
target = PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), pid.getValue()));
break;
default:
return null;
}
return OafMapperUtils
.getRelation(
source, target, ModelConstants.RESULT_PERSON_RELTYPE,
ModelConstants.RESULT_PERSON_SUBRELTYPE,
ModelConstants.RESULT_PERSON_HASAUTHORED,
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
OafMapperUtils
.dataInfo(
false, null, false, false,
OafMapperUtils
.qualifier(
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.91"),
null);
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import eu.dnetlib.dhp.collection.orcid.model.Work;
public class WorkList implements Serializable {
private ArrayList<Work> workArrayList;
public ArrayList<Work> getWorkArrayList() {
return workArrayList;
}
public void setWorkArrayList(ArrayList<Work> workArrayList) {
this.workArrayList = workArrayList;
}
public WorkList() {
workArrayList = new ArrayList<>();
}
}

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.actionmanager.sdgnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Hdfs;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
CreateActionSetSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/fosnodoi/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
createActionSet(spark, inputPath, outputPath);
});
}
private static void createActionSet(SparkSession spark, String inputPath, String outputPath) {
spark
.read()
.textFile(inputPath)
.map(
(MapFunction<String, Result>) value -> OBJECT_MAPPER.readValue(value, Result.class),
Encoders.bean(Result.class))
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
}

View File

@ -5,13 +5,13 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType;
@ -20,7 +20,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -44,7 +43,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
private static final String PMID_PREFIX = "50|pmid________::";
private static final String PMCID_PREFIX = "50|pmc_________::";
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
private static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
@ -70,9 +70,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String blackListInputPath = parser.get("blackListPath");
log.info("blackListInputPath: {}", blackListInputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
@ -80,29 +77,25 @@ public class CreateActionSetFromWebEntries implements Serializable {
isSparkSessionManaged,
spark -> {
createActionSet(spark, inputPath, outputPath, blackListInputPath); createActionSet(spark, inputPath, outputPath );
});
}
public static void createActionSet(SparkSession spark, String inputPath,
String outputPath, String blackListInputPath) { String outputPath) {
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
.filter("country_code=='IE'") .filter("publication_year <= 2020 or country_code=='IE'")
.drop("publication_year");
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath); dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
dataset
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
.drop("OpenAlexId")
.flatMap((FlatMapFunction<Row, Relation>) row -> {
List<Relation> ret = new ArrayList<>();
final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret
.iterator();
@ -112,7 +105,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
@ -136,22 +129,13 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution", functions "institution", functions
.explode( .explode(
functions.col("institutions"))) functions.col("institutions")))
.selectExpr( .selectExpr(
"id", "doi", "institution.ror as ror", "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
"institution.country_code as country_code", "publication_year") "institution.country_code as country_code", "publication_year")
.distinct(); .distinct();
} }
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
return spark
.read()
.json(inputPath)
.select("OpenAlexId");
}
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
if (pmcid == null)
return new ArrayList<>();
@ -159,7 +143,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
return createAffiliatioRelationPair(
PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))), .md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC" , pmcid))),
ror);
}
@ -175,7 +159,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
}
private static String removeResolver(String pidType, String pid) {
switch (pidType) { switch (pidType){
case "PMID":
return pid.substring(33);
case "PMC":
@ -195,7 +179,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
return createAffiliatioRelationPair(
DOI_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI", doi))), .md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI" ,doi))),
ror);
}
@ -211,7 +195,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
Arrays
.asList(
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,
@ -230,7 +214,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.HAS_AUTHOR_INSTITUTION,
Arrays
.asList(
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,

View File

@ -1,158 +0,0 @@
package eu.dnetlib.dhp.actionmanager.webcrawl;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Optional;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.io.filefilter.FileFileFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import scala.Tuple2;
public class RemoveRelationFromActionSet
implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CreateActionSetFromWebEntries.class);
private static final ObjectMapper MAPPER = new ObjectMapper();
private static final StructType KV_SCHEMA = StructType$.MODULE$
.apply(
Arrays
.asList(
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())));
private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$
.apply(
Arrays
.asList(
StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()),
StructField$.MODULE$
.apply(
"payload", DataTypes.StringType, false, Metadata.empty())));
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CreateActionSetFromWebEntries.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// the actionSet path
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String blackListInputPath = parser.get("blackListPath");
log.info("blackListInputPath: {}", blackListInputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeFromActionSet(spark, inputPath, outputPath, blackListInputPath);
});
}
private static void removeFromActionSet(SparkSession spark, String inputPath, String outputPath,
String blackListInputPath) {
// read the blacklist
Dataset<String> blackList = readBlackList(spark, blackListInputPath)
.map(
(MapFunction<Row, String>) r -> IdentifierFactory
.idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
Encoders.STRING());
// read the old actionset and get the relations in the payload
JavaPairRDD<Text, Text> seq = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(inputPath, Text.class, Text.class);
JavaRDD<Row> rdd = seq
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
Dataset<Row> actionSet = spark
.createDataFrame(rdd, KV_SCHEMA)
.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
.select(expr("atomic_action.*"));
Dataset<Relation> relation = actionSet
.map(
(MapFunction<Row, Relation>) r -> MAPPER.readValue((String) r.getAs("payload"), Relation.class),
Encoders.bean(Relation.class));
// select only the relation not matching any pid in the blacklist as source for the relation
Dataset<Relation> relNoSource = relation
.joinWith(blackList, relation.col("source").equalTo(blackList.col("value")), "left")
.filter((FilterFunction<Tuple2<Relation, String>>) t2 -> t2._2() == null)
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));
// select only the relation not matching any pid in the blacklist as target of the relation
relNoSource
.joinWith(blackList, relNoSource.col("target").equalTo(blackList.col("value")), "left")
.filter((FilterFunction<Tuple2<Relation, String>>) t2 -> t2._2() == null)
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
;
}
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
return spark
.read()
.json(inputPath)
.select("doi");
}
}

View File

@ -22,11 +22,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.gtr2.Gtr2PublicationsCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@ -60,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME; final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
log.info("outputPath path is {}", outputPath);
final CollectorPlugin plugin = getCollectorPlugin();
@ -70,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer fileSystem.getConf(),
.keyClass(IntWritable.class), SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer SequenceFile.Writer.keyClass(IntWritable.class),
.valueClass(Text.class), SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
plugin
.collect(this.api, this.report) .collect(api, report)
.forEach(content -> { .forEach(
content -> {
key.set(counter.getAndIncrement());
value.set(content);
try {
writer.append(key, value);
} catch (final Throwable e) { } catch (Throwable e) {
throw new RuntimeException(e);
}
});
} catch (final Throwable e) { } catch (Throwable e) {
this.report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} finally {
shutdown();
this.report.ongoing(counter.longValue(), counter.longValue()); report.ongoing(counter.longValue(), counter.longValue());
}
}
private void scheduleReport(final AtomicInteger counter) { private void scheduleReport(AtomicInteger counter) {
schedule(new ReporterCallback() {
@Override
public Long getCurrent() {
return counter.longValue();
@ -114,37 +112,33 @@ public class CollectorWorker extends ReportingJob {
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) { switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
case oai:
return new OaiCollectorPlugin(this.clientParams); return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(this.clientParams); return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin(this.fileSystem); return new FileCollectorPlugin(fileSystem);
case fileGzip:
return new FileGZipCollectorPlugin(this.fileSystem); return new FileGZipCollectorPlugin(fileSystem);
case baseDump:
return new BaseCollectorPlugin(this.fileSystem);
case gtr2Publications:
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
case osfPreprints:
return new OsfPreprintsCollectorPlugin(this.clientParams);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(this.api.getParams().get("other_plugin_type")) .ofNullable(api.getParams().get("other_plugin_type"))
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
switch (plugin) {
case mdstore_mongodb_dump:
return new MongoDbDumpCollectorPlugin(this.fileSystem); return new MongoDbDumpCollectorPlugin(fileSystem);
case mdstore_mongodb:
return new MDStoreCollectorPlugin();
default:
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
}
default:
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol()); throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
}
}

View File

@ -20,9 +20,6 @@ public class Author extends ORCIDItem {
private String lastModifiedDate;
public Author() {
}
public String getBiography() {
return biography;
}

View File

@ -11,7 +11,4 @@ public class ORCIDItem {
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public ORCIDItem() {
}
}

View File

@ -32,6 +32,4 @@ public class Work extends ORCIDItem {
pids.add(pid);
}
public Work() {
}
}

View File

@ -11,7 +11,7 @@ public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints; oai, other, rest_json2xml, file, fileGzip, baseDump;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb

View File

@ -1,43 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.gtr2;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
public class Gtr2PublicationsCollectorPlugin implements CollectorPlugin {
private final HttpClientParams clientParams;
public Gtr2PublicationsCollectorPlugin(final HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
final String baseUrl = api.getBaseUrl();
final String startPage = api.getParams().get("startPage");
final String endPage = api.getParams().get("endPage");
final String fromDate = api.getParams().get("fromDate");
if ((fromDate != null) && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
}
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseUrl, fromDate, startPage, endPage,
this.clientParams);
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
return StreamSupport.stream(spliterator, false);
}
}

Some files were not shown because too many files have changed in this diff.