merged with master

Enrico Ottonello 2021-07-14 15:33:59 +02:00
commit 7840cc6526
627 changed files with 41716 additions and 18896 deletions

.gitignore

@@ -7,6 +7,8 @@
*.iws
*~
.vscode
+.metals
+.bloop
.classpath
/*/.classpath
/*/*/.classpath
@@ -24,4 +26,5 @@
spark-warehouse
/**/job-override.properties
/**/*.log
+/**/.factorypath

@@ -7,6 +7,7 @@
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-common</artifactId>
@@ -20,6 +21,10 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
+<dependency>
+<groupId>com.github.sisyphsu</groupId>
+<artifactId>dateparser</artifactId>
+</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@@ -53,11 +58,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
-<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
-<dependency>
-<groupId>com.rabbitmq</groupId>
-<artifactId>amqp-client</artifactId>
-</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@@ -98,6 +98,16 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
+<dependency>
+<groupId>org.apache.httpcomponents</groupId>
+<artifactId>httpclient</artifactId>
+</dependency>
+<dependency>
+<groupId>org.mongodb</groupId>
+<artifactId>mongo-java-driver</artifactId>
+</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>

@@ -0,0 +1,14 @@
package eu.dnetlib.dhp.application;
import java.io.*;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import com.google.common.collect.Maps;
public class ApplicationUtils {
}

@@ -1,10 +1,7 @@
package eu.dnetlib.dhp.application;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.Serializable;
-import java.io.StringWriter;
+import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class ArgumentApplicationParser implements Serializable {
+private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
private final Options options = new Options();
private final Map<String, String> objectMap = new HashMap<>();
private final List<String> compressedValues = new ArrayList<>();
-public ArgumentApplicationParser(final String json_configuration) throws Exception {
+public ArgumentApplicationParser(final String json_configuration) throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
createOptionMap(configuration);
@@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays
.stream(configuration)
.map(
@@ -47,10 +47,6 @@
return o;
})
.forEach(options::addOption);
-// HelpFormatter formatter = new HelpFormatter();
-// formatter.printHelp("myapp", null, options, null, true);
}
public static String decompressValue(final String abstractCompressed) {
@@ -61,7 +57,7 @@
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (Throwable e) {
-System.out.println("Wrong value to decompress:" + abstractCompressed);
+log.error("Wrong value to decompress:" + abstractCompressed);
throw new RuntimeException(e);
}
}
@@ -74,7 +70,7 @@
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
-public void parseArgument(final String[] args) throws Exception {
+public void parseArgument(final String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
Arrays
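Usage sketch (not part of the commit): how a job typically drives the parser above after this change, with the constructor now declaring IOException and parseArgument declaring ParseException. The job class, resource name and parameter key are illustrative assumptions, and the get(String) accessor belongs to the existing class outside the hunks shown here.

    // hypothetical driver; "/my_job_parameters.json" and "inputPath" are placeholders
    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils.toString(MyJob.class.getResourceAsStream("/my_job_parameters.json")));
        parser.parseArgument(args);                        // throws ParseException on bad arguments
        final String inputPath = parser.get("inputPath");  // option declared in the JSON configuration
    }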

@@ -1,5 +1,5 @@
-package eu.dnetlib.collector.worker.model;
+package eu.dnetlib.dhp.collection;
import java.util.HashMap;
import java.util.Map;

@@ -27,4 +27,26 @@ public class Constants {
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
+public static final String SEQUENCE_FILE_NAME = "/sequence_file";
+public static final String REPORT_FILE_NAME = "/report";
+public static final String MDSTORE_DATA_PATH = "/store";
+public static final String MDSTORE_SIZE_PATH = "/size";
+public static final String COLLECTION_MODE = "collectionMode";
+public static final String METADATA_ENCODING = "metadataEncoding";
+public static final String OOZIE_WF_PATH = "oozieWfPath";
+public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
+public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
+public static final String REQUEST_DELAY = "requestDelay";
+public static final String RETRY_DELAY = "retryDelay";
+public static final String CONNECT_TIMEOUT = "connectTimeOut";
+public static final String READ_TIMEOUT = "readTimeOut";
+public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
+public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
+public static final String CONTENT_TOTALITEMS = "TotalItems";
+public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
+public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
}

@@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
-private Connection connection;
+private final Connection connection;
public DbClient(final String address, final String login, final String password) {

@@ -100,7 +100,7 @@
BufferedInputStream bis = new BufferedInputStream(is);
int count;
-byte data[] = new byte[1024];
+byte[] data = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}

@@ -1,39 +1,60 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
+import java.util.Optional;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
+import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
+import com.mongodb.QueryBuilder;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
+private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
-private static final Log log = LogFactory.getLog(MdstoreClient.class);
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
+public MongoCollection<Document> mdStore(final String mdId) {
+BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
+log.info("querying current mdId: {}", query.toJson());
+final String currentId = Optional
+.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
+.map(r -> r.first())
+.map(d -> d.getString("currentId"))
+.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
+log.info("currentId: {}", currentId);
+return getColl(db, currentId, true);
+}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
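Usage sketch (not part of the commit): resolving the current mdstore collection through the new mdStore(mdId) lookup. The connection string, database name and mdstore identifier are placeholders.

    // illustrative values; the real ones come from workflow parameters
    try (MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore")) {
        MongoCollection<Document> coll = client.mdStore("md-1234");
        Document first = coll.find().first();  // a record from the store flagged as "currentId"
    }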

@@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
-private InputStream inputStream;
-private MediaType mediaType;
-private long lenght;
+private final InputStream inputStream;
+private final MediaType mediaType;
+private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {

@@ -0,0 +1,72 @@
package eu.dnetlib.dhp.common.rest;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static final ObjectMapper mapper = new ObjectMapper();
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws Exception {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
CloseableHttpClient client = HttpClients.createDefault();
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log
.info(
"request headers: {}",
Arrays
.asList(r.getAllHeaders())
.stream()
.map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(",")));
CloseableHttpResponse response = client.execute(r);
return IOUtils.toString(response.getEntity().getContent());
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
return mapper.readValue(doHTTPRequest(r), clazz);
}
}
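Usage sketch (not part of the commit): the helper above wraps a CloseableHttpClient call and maps JSON responses with Jackson. The endpoint URLs and the SomeDto type are illustrative placeholders.

    // hypothetical endpoints and DTO
    String body = DNetRestClient.doGET("http://localhost:8280/api/info");
    SomeDto dto = DNetRestClient.doGET("http://localhost:8280/api/info", SomeDto.class);
    String outcome = DNetRestClient.doPOST("http://localhost:8280/api/submit", dto);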

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
@@ -10,8 +10,8 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
-import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Vocabulary implements Serializable {

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.*;
@@ -7,8 +7,8 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
-import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable {
private final Map<String, Vocabulary> vocs = new HashMap<>();
+public Set<String> vocabularyNames() {
+return vocs.keySet();
+}
public void addVocabulary(final String id, final String name) {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
@@ -118,7 +122,31 @@
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
+/**
+* getSynonymAsQualifierCaseSensitive
+*
+* refelects the situation to check caseSensitive vocabulary
+*/
+public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
+if (StringUtils.isBlank(vocId)) {
+return OafMapperUtils.unknown("", "");
+}
+return vocs.get(vocId).getSynonymAsQualifier(syn);
+}
+/**
+* termExists
+*
+* two methods: without and with caseSensitive check
+*/
public boolean termExists(final String vocId, final String id) {
+return termExists(vocId, id, Boolean.FALSE);
+}
+public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
+if (Boolean.TRUE.equals(caseSensitive)) {
+return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
+}
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
}
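Usage sketch (not part of the commit): the new case-sensitive variants next to the pre-existing lookups, assuming a VocabularyGroup instance already loaded elsewhere (e.g. from the ISLookUp service); the vocabulary id and terms are illustrative.

    // vocs is a VocabularyGroup obtained elsewhere; ids and terms are placeholders
    boolean known = vocs.termExists("dnet:languages", "eng");               // lowercases the vocabulary id
    boolean exact = vocs.termExists("dnet:languages", "eng", Boolean.TRUE); // case-sensitive check
    Qualifier q = vocs.getSynonymAsQualifierCaseSensitive("dnet:languages", "English");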

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;

@@ -0,0 +1,64 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L;
public static String CURRENT_PARAM = "current";
public static String TOTAL_PARAM = "total";
private MessageType messageType;
private String workflowId;
private Map<String, String> body;
public Message() {
}
public Message(final MessageType messageType, final String workflowId) {
this(messageType, workflowId, new LinkedHashMap<>());
}
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
this.messageType = messageType;
this.workflowId = workflowId;
this.body = body;
}
public MessageType getMessageType() {
return messageType;
}
public void setMessageType(MessageType messageType) {
this.messageType = messageType;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(final String workflowId) {
this.workflowId = workflowId;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(final Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
}
}

@@ -0,0 +1,94 @@
package eu.dnetlib.dhp.message;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MessageSender {
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
private static final int SOCKET_TIMEOUT_MS = 2000;
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
private static final int CONNTECTION_TIMEOUT_MS = 2000;
private final ObjectMapper objectMapper = new ObjectMapper();
private final String dnetMessageEndpoint;
private final String workflowId;
private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;
this.dnetMessageEndpoint = dnetMessageEndpoint;
}
public void sendMessage(final Message message) {
executorService.submit(() -> _sendMessage(message));
}
public void sendMessage(final Long current, final Long total) {
sendMessage(createOngoingMessage(current, total));
}
public void sendReport(final Map<String, String> report) {
sendMessage(new Message(MessageType.REPORT, workflowId, report));
}
private Message createOngoingMessage(final Long current, final Long total) {
final Message m = new Message(MessageType.ONGOING, workflowId);
m.getBody().put(Message.CURRENT_PARAM, current.toString());
if (total != null) {
m.getBody().put(Message.TOTAL_PARAM, total.toString());
}
return m;
}
private void _sendMessage(final Message message) {
try {
final String json = objectMapper.writeValueAsString(message);
final HttpPut req = new HttpPut(dnetMessageEndpoint);
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
final RequestConfig requestConfig = RequestConfig
.custom()
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
.setSocketTimeout(SOCKET_TIMEOUT_MS)
.build();
try (final CloseableHttpClient client = HttpClients
.custom()
.setDefaultRequestConfig(requestConfig)
.build();
final CloseableHttpResponse response = client.execute(req)) {
log.debug("Sent Message to " + dnetMessageEndpoint);
log.debug("MESSAGE:" + message);
} catch (final Throwable e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
} catch (final JsonProcessingException e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
}
}
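Usage sketch (not part of the commit): how a collection or transformation job can report progress to the D-Net message manager with the classes above. The endpoint and workflow id are placeholders; the report key matches CONTENT_TOTALITEMS from the new Constants.

    // placeholder endpoint and workflow id
    MessageSender sender = new MessageSender("http://localhost:8280/dnet-mm", "wf-123");
    sender.sendMessage(1000L, 10000L);           // asynchronous ONGOING message with current/total
    Map<String, String> report = new HashMap<>();
    report.put("TotalItems", "10000");
    sender.sendReport(report);                   // REPORT message carrying the final counters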

@@ -0,0 +1,21 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
public enum MessageType implements Serializable {
ONGOING, REPORT;
public MessageType from(String value) {
return Optional
.ofNullable(value)
.map(StringUtils::upperCase)
.map(MessageType::valueOf)
.orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value));
}
}

@@ -1,121 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
import eu.dnetlib.dhp.utils.DHPUtils;
/** This class models a record inside the new Metadata store collection on HDFS * */
public class MetadataRecord implements Serializable {
/** The D-Net Identifier associated to the record */
private String id;
/** The original Identifier of the record */
private String originalId;
/** The encoding of the record, should be JSON or XML */
private String encoding;
/**
* The information about the provenance of the record see @{@link Provenance} for the model of this information
*/
private Provenance provenance;
/** The content of the metadata */
private String body;
/** the date when the record has been stored */
private long dateOfCollection;
/** the date when the record has been stored */
private long dateOfTransformation;
public MetadataRecord() {
this.dateOfCollection = System.currentTimeMillis();
}
public MetadataRecord(
String originalId,
String encoding,
Provenance provenance,
String body,
long dateOfCollection) {
this.originalId = originalId;
this.encoding = encoding;
this.provenance = provenance;
this.body = body;
this.dateOfCollection = dateOfCollection;
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getOriginalId() {
return originalId;
}
public void setOriginalId(String originalId) {
this.originalId = originalId;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public Provenance getProvenance() {
return provenance;
}
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
}
public String getBody() {
return body;
}
public void setBody(String body) {
this.body = body;
}
public long getDateOfCollection() {
return dateOfCollection;
}
public void setDateOfCollection(long dateOfCollection) {
this.dateOfCollection = dateOfCollection;
}
public long getDateOfTransformation() {
return dateOfTransformation;
}
public void setDateOfTransformation(long dateOfTransformation) {
this.dateOfTransformation = dateOfTransformation;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof MetadataRecord)) {
return false;
}
return ((MetadataRecord) o).getId().equalsIgnoreCase(id);
}
@Override
public int hashCode() {
return id.hashCode();
}
}

@@ -1,52 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
/**
* @author Sandro La Bruzzo
* <p>
* Provenace class models the provenance of the record in the metadataStore It contains the identifier and the
* name of the datasource that gives the record
*/
public class Provenance implements Serializable {
private String datasourceId;
private String datasourceName;
private String nsPrefix;
public Provenance() {
}
public Provenance(String datasourceId, String datasourceName, String nsPrefix) {
this.datasourceId = datasourceId;
this.datasourceName = datasourceName;
this.nsPrefix = nsPrefix;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getNsPrefix() {
return nsPrefix;
}
public void setNsPrefix(String nsPrefix) {
this.nsPrefix = nsPrefix;
}
}

@@ -1,49 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

@@ -1,33 +1,37 @@
-package eu.dnetlib.dhp.oa.graph.clean;
+package eu.dnetlib.dhp.schema.oaf.utils;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
-import org.jetbrains.annotations.NotNull;
-import com.clearspring.analytics.util.Lists;
+import com.github.sisyphsu.dateparser.DateParserUtils;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
-import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
-public class CleaningFunctions {
+public class GraphCleaningFunctions extends CleaningFunctions {
-public static final String DOI_PREFIX_REGEX = "^10\\.";
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
-public static final Set<String> PID_BLACKLIST = new HashSet<>();
-static {
-PID_BLACKLIST.add("none");
-PID_BLACKLIST.add("na");
-}
+public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
+public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
+public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
@@ -59,23 +63,17 @@
}
}
if (Objects.nonNull(r.getAuthor())) {
-r
-.getAuthor()
-.stream()
-.filter(Objects::nonNull)
-.forEach(a -> {
-if (Objects.nonNull(a.getPid())) {
-a
-.getPid()
-.stream()
-.filter(Objects::nonNull)
-.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
-}
-});
+r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
+if (Objects.nonNull(a.getPid())) {
+a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
+fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
+});
+}
+});
}
if (value instanceof Publication) {
-} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
+} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
@@ -87,7 +85,37 @@
return value;
}
-public static <T extends Oaf> T fixDefaults(T value) {
+public static <T extends Oaf> boolean filter(T value) {
+if (value instanceof Datasource) {
+// nothing to evaluate here
+} else if (value instanceof Project) {
+// nothing to evaluate here
+} else if (value instanceof Organization) {
+// nothing to evaluate here
+} else if (value instanceof Relation) {
+// nothing to clean here
+} else if (value instanceof Result) {
+Result r = (Result) value;
+if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
+return false;
+}
+if (value instanceof Publication) {
+} else if (value instanceof Dataset) {
+} else if (value instanceof OtherResearchProduct) {
+} else if (value instanceof Software) {
+}
+}
+return true;
+}
+public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@@ -98,10 +126,44 @@
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
-// nothing to clean here
+Relation r = (Relation) value;
+Optional<String> validationDate = doCleanDate(r.getValidationDate());
+if (validationDate.isPresent()) {
+r.setValidationDate(validationDate.get());
+r.setValidated(true);
+} else {
+r.setValidationDate(null);
+r.setValidated(false);
+}
} else if (value instanceof Result) {
Result r = (Result) value;
+if (Objects.nonNull(r.getDateofacceptance())) {
+Optional<String> date = cleanDateField(r.getDateofacceptance());
+if (date.isPresent()) {
+r.getDateofacceptance().setValue(date.get());
+} else {
+r.setDateofacceptance(null);
+}
+}
+if (Objects.nonNull(r.getRelevantdate())) {
+r
+.setRelevantdate(
+r
+.getRelevantdate()
+.stream()
+.filter(Objects::nonNull)
+.filter(sp -> Objects.nonNull(sp.getQualifier()))
+.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+.map(sp -> {
+sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
+return sp;
+})
+.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+.collect(Collectors.toList()));
+}
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
@@ -110,16 +172,6 @@
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
-if (Objects.nonNull(r.getCountry())) {
-r
-.setCountry(
-r
-.getCountry()
-.stream()
-.filter(Objects::nonNull)
-.filter(c -> StringUtils.isNotBlank(c.getClassid()))
-.collect(Collectors.toList()));
-}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
@@ -130,7 +182,7 @@
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
-.map(CleaningFunctions::cleanValue)
+.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
@@ -141,7 +193,13 @@
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
-.map(CleaningFunctions::cleanValue)
+.filter(
+sp -> sp
+.getValue()
+.toLowerCase()
+.replaceAll(TITLE_FILTER_REGEX, "")
+.length() > TITLE_FILTER_RESIDUAL_LENGTH)
+.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
@@ -152,22 +210,11 @@
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
-.map(CleaningFunctions::cleanValue)
+.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
-r
-.setPid(
-r
-.getPid()
-.stream()
-.filter(Objects::nonNull)
-.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
-.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
-.filter(sp -> Objects.nonNull(sp.getQualifier()))
-.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
-.map(CleaningFunctions::normalizePidValue)
-.collect(Collectors.toList()));
+r.setPid(processPidCleaning(r.getPid()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
@@ -175,11 +222,32 @@
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
+if (Objects.nonNull(i.getPid())) {
+i.setPid(processPidCleaning(i.getPid()));
+}
+if (Objects.nonNull(i.getAlternateIdentifier())) {
+i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
+}
+Optional
+.ofNullable(i.getPid())
+.ifPresent(pid -> {
+final Set<StructuredProperty> pids = Sets.newHashSet(pid);
+Optional
+.ofNullable(i.getAlternateIdentifier())
+.ifPresent(altId -> {
+final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
+i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
+});
+});
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i
.setAccessright(
-qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
+accessRight(
+ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
@@ -187,14 +255,24 @@
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
+if (Objects.nonNull(i.getDateofacceptance())) {
+Optional<String> date = cleanDateField(i.getDateofacceptance());
+if (date.isPresent()) {
+i.getDateofacceptance().setValue(date.get());
+} else {
+i.setDateofacceptance(null);
+}
+}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
-Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
+Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
-qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
+qualifier(
+ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
+ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
@@ -280,11 +358,10 @@
.collect(Collectors.toList()));
}
}
-}
if (value instanceof Publication) {
-} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
+} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
@@ -296,6 +373,79 @@
return value;
}
+private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
+return Optional
+.ofNullable(dateofacceptance)
+.map(Field::getValue)
+.map(GraphCleaningFunctions::cleanDate)
+.filter(Objects::nonNull);
+}
+protected static Optional<String> doCleanDate(String date) {
+return Optional.ofNullable(cleanDate(date));
+}
+public static String cleanDate(final String inputDate) {
+if (StringUtils.isBlank(inputDate)) {
+return null;
+}
+try {
+final LocalDate date = DateParserUtils
+.parseDate(inputDate.trim())
+.toInstant()
+.atZone(ZoneId.systemDefault())
+.toLocalDate();
+return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
+} catch (DateTimeParseException e) {
+return null;
+}
+}
+// HELPERS
+private static boolean isValidAuthorName(Author a) {
+return !Stream
+.of(a.getFullname(), a.getName(), a.getSurname())
+.filter(s -> s != null && !s.isEmpty())
+.collect(Collectors.joining(""))
+.toLowerCase()
+.matches(INVALID_AUTHOR_REGEX);
+}
+private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
+return pids
+.stream()
+.filter(Objects::nonNull)
+.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
+.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
+.filter(sp -> Objects.nonNull(sp.getQualifier()))
+.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+.map(CleaningFunctions::normalizePidValue)
+.filter(CleaningFunctions::pidFilter)
+.collect(Collectors.toList());
+}
+private static void fixVocabName(Qualifier q, String vocabularyName) {
+if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
+q.setSchemeid(vocabularyName);
+q.setSchemename(vocabularyName);
+}
+}
+private static AccessRight accessRight(String classid, String classname, String scheme) {
+return OafMapperUtils
+.accessRight(
+classid, classname, scheme, scheme);
+}
+private static Qualifier qualifier(String classid, String classname, String scheme) {
+return OafMapperUtils
+.qualifier(
+classid, classname, scheme, scheme);
+}
protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
@@ -306,39 +456,4 @@
return s;
}
-// HELPERS
-private static void fixVocabName(Qualifier q, String vocabularyName) {
-if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
-q.setSchemeid(vocabularyName);
-q.setSchemename(vocabularyName);
-}
-}
-private static Qualifier qualifier(String classid, String classname, String scheme) {
-return OafMapperUtils
-.qualifier(
-classid, classname, scheme, scheme);
-}
-/**
-* Utility method that normalises PID values on a per-type basis.
-* @param pid the PID whose value will be normalised.
-* @return the PID containing the normalised value.
-*/
-public static StructuredProperty normalizePidValue(StructuredProperty pid) {
-String value = Optional
-.ofNullable(pid.getValue())
-.map(String::trim)
-.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
-switch (pid.getQualifier().getClassid()) {
-// TODO add cleaning for more PID types as needed
-case "doi":
-pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
-break;
-}
-return pid;
-}
}
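Usage sketch (not part of the commit): the new date handling relies on the dateparser dependency added to the pom above, and filter/cleanup replace the old fixDefaults entry point. The output format follows ModelSupport.DATE_FORMAT (assumed here to be yyyy-MM-dd); the inputs are examples.

    String d1 = GraphCleaningFunctions.cleanDate("20/09/2018");  // parsed via DateParserUtils, then reformatted
    String d2 = GraphCleaningFunctions.cleanDate("not a date");  // null on unparseable input
    Publication p = new Publication();
    if (GraphCleaningFunctions.filter(p)) {                      // false only for results with an empty title list
        p = GraphCleaningFunctions.cleanup(p);                   // date, pid, accessright and title cleaning
    }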

@@ -1,11 +1,9 @@
-package eu.dnetlib.dhp.schema.oaf;
+package eu.dnetlib.dhp.schema.oaf.utils;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
@@ -13,42 +11,45 @@
import org.apache.commons.lang3.StringUtils;
-import com.google.common.base.Joiner;
+import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils {
-public static Oaf merge(final Oaf o1, final Oaf o2) {
-if (ModelSupport.isSubClass(o1, OafEntity.class)) {
-if (ModelSupport.isSubClass(o1, Result.class)) {
-return mergeResults((Result) o1, (Result) o2);
-} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
-((Datasource) o1).mergeFrom((Datasource) o2);
-} else if (ModelSupport.isSubClass(o1, Organization.class)) {
-((Organization) o1).mergeFrom((Organization) o2);
-} else if (ModelSupport.isSubClass(o1, Project.class)) {
-((Project) o1).mergeFrom((Project) o2);
-} else {
-throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
-}
-} else if (ModelSupport.isSubClass(o1, Relation.class)) {
-((Relation) o1).mergeFrom((Relation) o2);
+public static Oaf merge(final Oaf left, final Oaf right) {
+if (ModelSupport.isSubClass(left, OafEntity.class)) {
+return mergeEntities((OafEntity) left, (OafEntity) right);
+} else if (ModelSupport.isSubClass(left, Relation.class)) {
+((Relation) left).mergeFrom((Relation) right);
} else {
-throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
+throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
}
-return o1;
+return left;
}
-public static Result mergeResults(Result r1, Result r2) {
-if (new ResultTypeComparator().compare(r1, r2) < 0) {
-r1.mergeFrom(r2);
-return r1;
+public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
+if (ModelSupport.isSubClass(left, Result.class)) {
+return mergeResults((Result) left, (Result) right);
+} else if (ModelSupport.isSubClass(left, Datasource.class)) {
+left.mergeFrom(right);
+} else if (ModelSupport.isSubClass(left, Organization.class)) {
+left.mergeFrom(right);
+} else if (ModelSupport.isSubClass(left, Project.class)) {
+left.mergeFrom(right);
} else {
-r2.mergeFrom(r1);
-return r2;
+throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
+}
+return left;
+}
+public static Result mergeResults(Result left, Result right) {
+if (new ResultTypeComparator().compare(left, right) < 0) {
+left.mergeFrom(right);
+return left;
+} else {
+right.mergeFrom(left);
+return right;
}
}
@@ -104,6 +105,29 @@
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
+public static AccessRight accessRight(
+final String classid,
+final String classname,
+final String schemeid,
+final String schemename) {
+return accessRight(classid, classname, schemeid, schemename, null);
+}
+public static AccessRight accessRight(
+final String classid,
+final String classname,
+final String schemeid,
+final String schemename,
+final OpenAccessRoute openAccessRoute) {
+final AccessRight accessRight = new AccessRight();
+accessRight.setClassid(classid);
+accessRight.setClassname(classname);
+accessRight.setSchemeid(schemeid);
+accessRight.setSchemename(schemename);
+accessRight.setOpenAccessRoute(openAccessRoute);
+return accessRight;
+}
public static Qualifier qualifier(
final String classid,
final String classname,
@@ -117,6 +141,15 @@
return q;
}
+public static Qualifier qualifier(final Qualifier qualifier) {
+final Qualifier q = new Qualifier();
+q.setClassid(qualifier.getClassid());
+q.setClassname(qualifier.getClassname());
+q.setSchemeid(qualifier.getSchemeid());
+q.setSchemename(qualifier.getSchemename());
+return q;
+}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@@ -267,7 +300,7 @@
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
-return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
+return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
@@ -300,4 +333,36 @@
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
+public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
+return getBestAccessRights(instanceList);
+}
+protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
+if (instanceList != null) {
+final Optional<AccessRight> min = instanceList
+.stream()
+.map(i -> i.getAccessright())
+.min(new AccessRightComparator<>());
+final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
+if (StringUtils.isBlank(rights.getClassid())) {
+rights.setClassid(UNKNOWN);
+}
+if (StringUtils.isBlank(rights.getClassname())
+|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
+rights.setClassname(NOT_AVAILABLE);
+}
+if (StringUtils.isBlank(rights.getSchemeid())) {
+rights.setSchemeid(DNET_ACCESS_MODES);
+}
+if (StringUtils.isBlank(rights.getSchemename())) {
+rights.setSchemename(DNET_ACCESS_MODES);
+}
+return rights;
+}
+return null;
+}
}
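Usage sketch (not part of the commit): the new AccessRight factory and the best-access-rights computation that replaces AbstractMdRecordToOafMapper.createBestAccessRights. The classid/classname values are illustrative.

    // illustrative classid/classname values
    AccessRight open = OafMapperUtils
        .accessRight("OPEN", "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES);
    Instance instance = new Instance();
    instance.setAccessright(open);
    Qualifier best = OafMapperUtils.createBestAccessRights(Arrays.asList(instance));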

@@ -1,18 +1,29 @@
package eu.dnetlib.dhp.utils;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
+import java.io.*;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.List;
+import java.util.Map;
+import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.SaveMode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
@@ -21,6 +32,8 @@ import scala.collection.Seq;
public class DHPUtils {
+private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
@@ -79,4 +92,72 @@
return "";
}
}
+public static final ObjectMapper MAPPER = new ObjectMapper();
+public static void writeHdfsFile(final Configuration conf, final String content, final String path)
+throws IOException {
+log.info("writing file {}, size {}", path, content.length());
+try (FileSystem fs = FileSystem.get(conf);
+BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
+os.write(content.getBytes(StandardCharsets.UTF_8));
+os.flush();
+}
+}
+public static String readHdfsFile(Configuration conf, String path) throws IOException {
+log.info("reading file {}", path);
+try (FileSystem fs = FileSystem.get(conf)) {
+final Path p = new Path(path);
+if (!fs.exists(p)) {
+throw new FileNotFoundException(path);
+}
+return IOUtils.toString(fs.open(p));
+}
+}
+public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
+return MAPPER.readValue(readHdfsFile(conf, path), clazz);
+}
+public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
+log.info("saving dataset in: {}", targetPath);
+mdstore
+.write()
+.mode(SaveMode.Overwrite)
+.format("parquet")
+.save(targetPath);
+}
+public static Configuration getHadoopConfiguration(String nameNode) {
+// ====== Init HDFS File System Object
+Configuration conf = new Configuration();
+// Set FileSystem URI
+conf.set("fs.defaultFS", nameNode);
+// Because of Maven
+conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+System.setProperty("hadoop.home.dir", "/");
+return conf;
+}
+public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
+File file = new File(System.getProperty("oozie.action.output.properties"));
+Properties props = new Properties();
+report.forEach((k, v) -> props.setProperty(k, v));
+try (OutputStream os = new FileOutputStream(file)) {
+props.store(os, "");
+}
+}
+public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
+Map<String, String> report = Maps.newHashMap();
+report.put(paramName, value);
+populateOOZIEEnv(report);
+}
}
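Usage sketch (not part of the commit): the helpers added above for writing and reading small files on HDFS. The name node URL, paths and the MyReport type are placeholders.

    // placeholder name node and paths
    Configuration conf = DHPUtils.getHadoopConfiguration("hdfs://nameservice1");
    DHPUtils.writeHdfsFile(conf, "{\"current\": \"100\"}", "/tmp/report.json");
    String content = DHPUtils.readHdfsFile(conf, "/tmp/report.json");
    MyReport report = DHPUtils.readHdfsFileAs(conf, "/tmp/report.json", MyReport.class);  // hypothetical DTO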

@@ -15,8 +15,8 @@ public class ISLookupClientFactory {
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
-private static int requestTimeout = 60000 * 10;
-private static int connectTimeout = 60000 * 10;
+private static final int requestTimeout = 60000 * 10;
+private static final int connectTimeout = 60000 * 10;
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);

@@ -1,76 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.Map;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class Message {
private String workflowId;
private String jobName;
private MessageType type;
private Map<String, String> body;
public static Message fromJson(final String json) throws IOException {
final ObjectMapper jsonMapper = new ObjectMapper();
return jsonMapper.readValue(json, Message.class);
}
public Message() {
}
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
this.workflowId = workflowId;
this.jobName = jobName;
this.type = type;
this.body = body;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(String workflowId) {
this.workflowId = workflowId;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public MessageType getType() {
return type;
}
public void setType(MessageType type) {
this.type = type;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
final ObjectMapper jsonMapper = new ObjectMapper();
try {
return jsonMapper.writeValueAsString(this);
} catch (JsonProcessingException e) {
return null;
}
}
}

@@ -1,47 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.LinkedBlockingQueue;
import com.rabbitmq.client.AMQP;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
public class MessageConsumer extends DefaultConsumer {
final LinkedBlockingQueue<Message> queueMessages;
/**
* Constructs a new instance and records its association to the passed-in channel.
*
* @param channel the channel to which this consumer is attached
* @param queueMessages
*/
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
super(channel);
this.queueMessages = queueMessages;
}
@Override
public void handleDelivery(
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
throws IOException {
final String json = new String(body, StandardCharsets.UTF_8);
Message message = Message.fromJson(json);
try {
this.queueMessages.put(message);
System.out.println("Receiving Message " + message);
} catch (InterruptedException e) {
if (message.getType() == MessageType.REPORT)
throw new RuntimeException("Error on sending message");
else {
// TODO LOGGING EXCEPTION
}
} finally {
getChannel().basicAck(envelope.getDeliveryTag(), false);
}
}
}

@@ -1,136 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeoutException;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
public class MessageManager {
private final String messageHost;
private final String username;
private final String password;
private Connection connection;
private final Map<String, Channel> channels = new HashMap<>();
private boolean durable;
private boolean autodelete;
private final LinkedBlockingQueue<Message> queueMessages;
public MessageManager(
String messageHost,
String username,
String password,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
}
public MessageManager(
String messageHost,
String username,
String password,
boolean durable,
boolean autodelete,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
this.durable = durable;
this.autodelete = autodelete;
}
private Connection createConnection() throws IOException, TimeoutException {
ConnectionFactory factory = new ConnectionFactory();
factory.setHost(this.messageHost);
factory.setUsername(this.username);
factory.setPassword(this.password);
return factory.newConnection();
}
private Channel createChannel(
final Connection connection,
final String queueName,
final boolean durable,
final boolean autodelete)
throws Exception {
Map<String, Object> args = new HashMap<>();
args.put("x-message-ttl", 10000);
Channel channel = connection.createChannel();
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
return channel;
}
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
throws Exception {
if (channels.containsKey(queueName)) {
return channels.get(queueName);
}
if (this.connection == null) {
this.connection = createConnection();
}
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
return channels.get(queueName);
}
public void close() throws IOException {
channels
.values()
.forEach(
ch -> {
try {
ch.close();
} catch (Exception e) {
// TODO LOG
}
});
this.connection.close();
}
public boolean sendMessage(final Message message, String queueName) throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public boolean sendMessage(
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public void startConsumingMessage(
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
}
}
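
A minimal usage sketch of the MessageManager removed above, assuming a reachable RabbitMQ broker; the host, credentials and queue names are placeholders, not values from this repository.

package eu.dnetlib.message;

import java.util.HashMap;
import java.util.concurrent.LinkedBlockingQueue;

public class MessageManagerSketch {

	public static void main(String[] args) throws Exception {
		final LinkedBlockingQueue<Message> incoming = new LinkedBlockingQueue<>();

		// placeholder broker coordinates and queue names, for illustration only
		final MessageManager manager = new MessageManager(
			"broker.example.org", "user", "secret", false, false, incoming);

		final Message m = new Message("wf-1", "Collection", MessageType.ONGOING, new HashMap<>());
		m.getBody().put("parsedItem", "300");

		manager.sendMessage(m, "dhp.ongoing.queue"); // publishes m.toString() as JSON
		manager.startConsumingMessage("dhp.report.queue", true, false); // fills 'incoming' asynchronously
		manager.close();
	}
}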

View File

@ -1,6 +0,0 @@
package eu.dnetlib.message;
public enum MessageType {
ONGOING, REPORT
}

File diff suppressed because one or more lines are too long

View File

@ -1,16 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
public class MetadataRecordTest {
@Test
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() > 0);
}
}

View File

@ -0,0 +1,180 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
}
@Test
public void testDate() {
System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
}
@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(1, p1.getCollectedfrom().size());
assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey());
assertEquals(1, d2.getCollectedfrom().size());
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid());
assertEquals(1, p2.getCollectedfrom().size());
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(1, d1.getCollectedfrom().size());
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.DATASET_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid());
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}
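
As a compact illustration of the contract exercised by testDateValidation above (a sketch, not repository code): doCleanDate normalises heterogeneous date strings to yyyy-MM-dd and returns an Optional, so callers can fall back when parsing fails. Package and imports mirror the test class above.

package eu.dnetlib.dhp.schema.oaf.utils;

import java.util.Optional;

import eu.dnetlib.dhp.schema.oaf.*;

public class DateCleaningSketch {

	public static void main(String[] args) {
		// expected to print 2009-05-08, matching the assertion in the test above
		Optional<String> cleaned = GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM");
		System.out.println(cleaned.orElse("unparsable"));

		// presumably empty for garbage input (an assumption, not asserted by the test above)
		System.out.println(GraphCleaningFunctions.doCleanDate("not a date").orElse("unparsable"));
	}
}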

View File

@ -1,51 +0,0 @@
package eu.dnetlib.message;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
public class MessageTest {
@Test
public void fromJsonTest() throws IOException {
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
System.out.println("m = " + m);
Message m1 = Message.fromJson(m.toString());
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
assertEquals(m1.getType(), m.getType());
assertEquals(m1.getJobName(), m.getJobName());
assertNotNull(m1.getBody());
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
assertEquals(m1.getJobName(), m.getJobName());
}
@Test
public void toStringTest() {
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
assertEquals(expectedJson, m.toString());
}
}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -51,16 +51,6 @@
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId> <artifactId>dnet-actionmanager-api</artifactId>

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.util.Comparator;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals("OPEN SOURCE"))
return -1;
if (rClass.equals("OPEN SOURCE"))
return 1;
if (lClass.equals("OPEN"))
return -1;
if (rClass.equals("OPEN"))
return 1;
if (lClass.equals("6MONTHS"))
return -1;
if (rClass.equals("6MONTHS"))
return 1;
if (lClass.equals("12MONTHS"))
return -1;
if (rClass.equals("12MONTHS"))
return 1;
if (lClass.equals("EMBARGO"))
return -1;
if (rClass.equals("EMBARGO"))
return 1;
if (lClass.equals("RESTRICTED"))
return -1;
if (rClass.equals("RESTRICTED"))
return 1;
if (lClass.equals("CLOSED"))
return -1;
if (rClass.equals("CLOSED"))
return 1;
if (lClass.equals("UNKNOWN"))
return -1;
if (rClass.equals("UNKNOWN"))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}
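
The comparator removed above orders access-right qualifiers from most to least open, so the "best" right among a result's instances can be selected with Stream.min, as getBestAccessRights does further down in this diff. A minimal sketch under that assumption:

package eu.dnetlib.dhp.actionmanager.migration;

import java.util.List;
import java.util.Optional;

import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;

public class BestAccessRightSketch {

	// returns the most open qualifier according to LicenseComparator, if the list is non-empty
	public static Optional<Qualifier> best(List<Qualifier> accessRights) {
		return accessRights.stream().min(new LicenseComparator());
	}
}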

View File

@ -1,196 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class MigrateActionSet {
private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
}
private void run(ArgumentApplicationParser parser) throws Exception {
final String isLookupUrl = parser.get("isLookupUrl");
final String sourceNN = parser.get("sourceNameNode");
final String targetNN = parser.get("targetNameNode");
final String workDir = parser.get("workingDirectory");
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
final String distcp_memory_mb = parser.get("distcp_memory_mb");
final String distcp_task_timeout = parser.get("distcp_task_timeout");
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: {}", transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: {}", transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
FileSystem targetFS = FileSystem.get(conf);
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
FileSystem sourceFS = FileSystem.get(sourceConf);
Properties props = new Properties();
List<Path> targetPaths = new ArrayList<>();
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log
.info(
"paths to process:\n{}", sourcePaths
.stream()
.map(p -> p.toString())
.collect(Collectors.joining("\n")));
for (Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn("skipping unexisting path: {}", source);
} else {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info("got RAWSET: {}", rawSet);
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
final String actionSetDirectory = pathQ.pollLast();
final Path targetPath = new Path(
targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
log.info("using TARGET PATH: {}", targetPath);
if (!transformOnly) {
if (targetFS.exists(targetPath)) {
targetFS.delete(targetPath, true);
}
runDistcp(
distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
}
targetPaths.add(targetPath);
}
}
}
final String targetPathsCsv = targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","));
props.setProperty(TARGET_PATHS, targetPathsCsv);
File file = new File(System.getProperty("oozie.action.output.properties"));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
System.out.println(file.getAbsolutePath());
}
private void runDistcp(
Integer distcp_num_maps,
String distcp_memory_mb,
String distcp_task_timeout,
Configuration conf,
Path source,
Path targetPath)
throws Exception {
final DistCpOptions op = new DistCpOptions(source, targetPath);
op.setMaxMaps(distcp_num_maps);
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
int res = ToolRunner
.run(
new DistCp(conf, op),
new String[] {
"-Dmapred.task.timeout=" + distcp_task_timeout,
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
"-pb",
"-m " + distcp_num_maps,
source.toString(),
targetPath.toString()
});
if (res != 0) {
throw new RuntimeException(String.format("distcp exited with code %s", res));
}
}
private Configuration getConfiguration(
String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
final Configuration conf = new Configuration();
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
conf.set("dfs.http.client.retry.policy.enabled", "true");
conf.set("mapred.task.timeout", distcp_task_timeout);
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
return conf;
}
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp)
throws ISLookUpException {
String XQUERY = "distinct-values(\n"
+ "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n"
+ "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n"
+ "let $setDir := $x//SET/@directory/string()\n"
+ "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n"
+ "return concat($basePath, '/', $setDir, '/', $rawSet))";
log.info(String.format("running xquery:\n%s", XQUERY));
return isLookUp
.quickSearchProfile(XQUERY)
.stream()
.map(p -> sourceNN + p)
.map(Path::new)
.collect(Collectors.toList());
}
}

View File

@ -1,710 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.*;
import eu.dnetlib.dhp.schema.oaf.*;
public class ProtoConverter implements Serializable {
public static Oaf convert(OafProtos.Oaf oaf) {
try {
switch (oaf.getKind()) {
case entity:
return convertEntity(oaf);
case relation:
return convertRelation(oaf);
default:
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
}
} catch (Throwable e) {
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
}
}
private static Relation convertRelation(OafProtos.Oaf oaf) {
final OafProtos.OafRel r = oaf.getRel();
final Relation rel = new Relation();
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
rel.setSource(r.getSource());
rel.setTarget(r.getTarget());
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel
.setCollectedfrom(
r.getCollectedfromCount() > 0
? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList())
: null);
return rel;
}
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getType()) {
case result:
final Result r = convertResult(oaf);
r.setInstance(convertInstances(oaf));
r.setExternalReference(convertExternalRefs(oaf));
return r;
case project:
return convertProject(oaf);
case datasource:
return convertDataSource(oaf);
case organization:
return convertOrganization(oaf);
default:
throw new RuntimeException("received unknown type");
}
}
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
final ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getInstanceCount() > 0) {
return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation());
i.setHostedby(mapKV(ri.getHostedby()));
i.setInstancetype(mapQualifier(ri.getInstancetype()));
i.setLicense(mapStringField(ri.getLicense()));
i
.setUrl(
ri.getUrlList() != null ? ri
.getUrlList()
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setClassname(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) {
return r
.getExternalReferenceList()
.stream()
.map(e -> convertExtRef(e))
.collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static ExternalReference convertExtRef(ResultProtos.Result.ExternalReference e) {
ExternalReference ex = new ExternalReference();
ex.setUrl(e.getUrl());
ex.setSitename(e.getSitename());
ex.setRefidentifier(e.getRefidentifier());
ex.setQuery(e.getQuery());
ex.setQualifier(mapQualifier(e.getQualifier()));
ex.setLabel(e.getLabel());
ex.setDescription(e.getDescription());
ex.setDataInfo(ex.getDataInfo());
return ex;
}
private static Organization convertOrganization(OafProtos.Oaf oaf) {
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
final Organization org = setOaf(new Organization(), oaf);
setEntity(org, oaf);
org.setLegalshortname(mapStringField(m.getLegalshortname()));
org.setLegalname(mapStringField(m.getLegalname()));
org
.setAlternativeNames(
m
.getAlternativeNamesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
org.setLogourl(mapStringField(m.getLogourl()));
org.setEclegalbody(mapStringField(m.getEclegalbody()));
org.setEclegalperson(mapStringField(m.getEclegalperson()));
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
org.setEchighereducation(mapStringField(m.getEchighereducation()));
org
.setEcinternationalorganizationeurinterests(
mapStringField(m.getEcinternationalorganizationeurinterests()));
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
org.setEcenterprise(mapStringField(m.getEcenterprise()));
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
org.setEcnutscode(mapStringField(m.getEcnutscode()));
org.setCountry(mapQualifier(m.getCountry()));
return org;
}
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
final Datasource datasource = setOaf(new Datasource(), oaf);
setEntity(datasource, oaf);
datasource
.setAccessinfopackage(
m
.getAccessinfopackageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setCertificates(mapStringField(m.getCertificates()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setContactemail(mapStringField(m.getContactemail()));
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
datasource.setDescription(mapStringField(m.getDescription()));
datasource.setEnglishname(mapStringField(m.getEnglishname()));
datasource.setLatitude(mapStringField(m.getLatitude()));
datasource.setLongitude(mapStringField(m.getLongitude()));
datasource.setLogourl(mapStringField(m.getLogourl()));
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
datasource
.setOdcontenttypes(
m
.getOdcontenttypesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource
.setOdlanguages(
m
.getOdlanguagesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
datasource.setOfficialname(mapStringField(m.getOfficialname()));
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
datasource.setPidsystems(mapStringField(m.getPidsystems()));
datasource
.setPolicies(
m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
datasource
.setSubjects(
m
.getSubjectsList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
datasource.setVersioning(mapBoolField(m.getVersioning()));
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
datasource.setJournal(mapJournal(m.getJournal()));
return datasource;
}
private static Project convertProject(OafProtos.Oaf oaf) {
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
final Project project = setOaf(new Project(), oaf);
setEntity(project, oaf);
project.setAcronym(mapStringField(m.getAcronym()));
project.setCallidentifier(mapStringField(m.getCallidentifier()));
project.setCode(mapStringField(m.getCode()));
project.setContactemail(mapStringField(m.getContactemail()));
project.setContactfax(mapStringField(m.getContactfax()));
project.setContactfullname(mapStringField(m.getContactfullname()));
project.setContactphone(mapStringField(m.getContactphone()));
project.setContracttype(mapQualifier(m.getContracttype()));
project.setCurrency(mapStringField(m.getCurrency()));
project.setDuration(mapStringField(m.getDuration()));
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
project.setEcsc39(mapStringField(m.getEcsc39()));
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
project.setStartdate(mapStringField(m.getStartdate()));
project.setEnddate(mapStringField(m.getEnddate()));
project.setFundedamount(m.getFundedamount());
project.setTotalcost(m.getTotalcost());
project.setKeywords(mapStringField(m.getKeywords()));
project
.setSubjects(
m
.getSubjectsList()
.stream()
.map(sp -> mapStructuredProperty(sp))
.collect(Collectors.toList()));
project.setTitle(mapStringField(m.getTitle()));
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
project
.setFundingtree(
m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList()));
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
project.setSummary(mapStringField(m.getSummary()));
project.setOptional1(mapStringField(m.getOptional1()));
project.setOptional2(mapStringField(m.getOptional2()));
return project;
}
private static Result convertResult(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
case "dataset":
return createDataset(oaf);
case "publication":
return createPublication(oaf);
case "software":
return createSoftware(oaf);
case "other":
return createORP(oaf);
default:
Result result = setOaf(new Result(), oaf);
setEntity(result, oaf);
return setResult(result, oaf);
}
}
private static Software createSoftware(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Software software = setOaf(new Software(), oaf);
setEntity(software, oaf);
setResult(software, oaf);
software
.setDocumentationUrl(
m
.getDocumentationUrlList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
software
.setLicense(
m
.getLicenseList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
return software;
}
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
setEntity(otherResearchProducts, oaf);
setResult(otherResearchProducts, oaf);
otherResearchProducts
.setContactperson(
m
.getContactpersonList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setContactgroup(
m
.getContactgroupList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setTool(
m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList()));
return otherResearchProducts;
}
private static Publication createPublication(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Publication publication = setOaf(new Publication(), oaf);
setEntity(publication, oaf);
setResult(publication, oaf);
publication.setJournal(mapJournal(m.getJournal()));
return publication;
}
private static Dataset createDataset(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Dataset dataset = setOaf(new Dataset(), oaf);
setEntity(dataset, oaf);
setResult(dataset, oaf);
dataset.setStoragedate(mapStringField(m.getStoragedate()));
dataset.setDevice(mapStringField(m.getDevice()));
dataset.setSize(mapStringField(m.getSize()));
dataset.setVersion(mapStringField(m.getVersion()));
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
dataset
.setGeolocation(
m
.getGeolocationList()
.stream()
.map(ProtoConverter::mapGeolocation)
.collect(Collectors.toList()));
return dataset;
}
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
return oaf;
}
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final OafProtos.OafEntity e = oaf.getEntity();
entity.setId(e.getId());
entity.setOriginalId(e.getOriginalIdList());
entity
.setCollectedfrom(
e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
entity
.setPid(
e
.getPidList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDateofcollection(e.getDateofcollection());
entity.setDateoftransformation(e.getDateoftransformation());
entity
.setExtraInfo(
e
.getExtraInfoList()
.stream()
.map(ProtoConverter::mapExtraInfo)
.collect(Collectors.toList()));
return entity;
}
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
entity
.setAuthor(
m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList()));
entity.setResulttype(mapQualifier(m.getResulttype()));
entity.setLanguage(mapQualifier(m.getLanguage()));
entity
.setCountry(
m
.getCountryList()
.stream()
.map(ProtoConverter::mapQualifierAsCountry)
.collect(Collectors.toList()));
entity
.setSubject(
m
.getSubjectList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setTitle(
m
.getTitleList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setRelevantdate(
m
.getRelevantdateList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setDescription(
m
.getDescriptionList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
entity.setPublisher(mapStringField(m.getPublisher()));
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
entity
.setSource(
m
.getSourceList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFulltext(
m
.getFulltextList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFormat(
m
.getFormatList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContributor(
m
.getContributorList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setResourcetype(mapQualifier(m.getResourcetype()));
entity
.setCoverage(
m
.getCoverageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContext(
m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList()));
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
return entity;
}
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
if (instanceList != null) {
final Optional<FieldTypeProtos.Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private static Context mapContext(ResultProtos.Result.Context context) {
if (context == null || StringUtils.isBlank(context.getId())) {
return null;
}
final Context entity = new Context();
entity.setId(context.getId());
entity
.setDataInfo(
context
.getDataInfoList()
.stream()
.map(ProtoConverter::mapDataInfo)
.collect(Collectors.toList()));
return entity;
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
if (kv == null || StringUtils.isBlank(kv.getKey()) && StringUtils.isBlank(kv.getValue())) {
return null;
}
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
return keyValue;
}
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
dataInfo.setInferred(d.getInferred());
dataInfo.setInvisible(d.getInvisible());
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
dataInfo.setTrust(d.getTrust());
return dataInfo;
}
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(q.getClassid());
qualifier.setClassname(q.getClassname());
qualifier.setSchemeid(q.getSchemeid());
qualifier.setSchemename(q.getSchemename());
return qualifier;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country();
c.setClassid(q.getClassid());
c.setClassname(q.getClassname());
c.setSchemeid(q.getSchemeid());
c.setSchemename(q.getSchemename());
c.setDataInfo(mapDataInfo(q.getDataInfo()));
return c;
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
if (sp == null || StringUtils.isBlank(sp.getValue())) {
return null;
}
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
return structuredProperty;
}
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
final ExtraInfo entity = new ExtraInfo();
entity.setName(extraInfo.getName());
entity.setTypology(extraInfo.getTypology());
entity.setProvenance(extraInfo.getProvenance());
entity.setTrust(extraInfo.getTrust());
entity.setValue(extraInfo.getValue());
return entity;
}
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
final OAIProvenance entity = new OAIProvenance();
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
return entity;
}
public static OriginDescription mapOriginalDescription(
FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
final OriginDescription originDescriptionResult = new OriginDescription();
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
originDescriptionResult.setAltered(originDescription.getAltered());
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
return originDescriptionResult;
}
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
if (s == null || StringUtils.isBlank(s.getValue())) {
return null;
}
final Field<String> stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
return stringField;
}
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
if (b == null) {
return null;
}
final Field<Boolean> booleanField = new Field<>();
booleanField.setValue(b.getValue());
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
return booleanField;
}
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
journal.setConferenceplace(j.getConferenceplace());
journal.setEdition(j.getEdition());
journal.setEp(j.getEp());
journal.setIss(j.getIss());
journal.setIssnLinking(j.getIssnLinking());
journal.setIssnOnline(j.getIssnOnline());
journal.setIssnPrinted(j.getIssnPrinted());
journal.setName(j.getName());
journal.setSp(j.getSp());
journal.setVol(j.getVol());
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
return journal;
}
public static Author mapAuthor(FieldTypeProtos.Author author) {
final Author entity = new Author();
entity.setFullname(author.getFullname());
entity.setName(author.getName());
entity.setSurname(author.getSurname());
entity.setRank(author.getRank());
entity
.setPid(
author
.getPidList()
.stream()
.map(
kv -> {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(kv.getValue());
final Qualifier q = new Qualifier();
q.setClassid(kv.getKey());
q.setClassname(kv.getKey());
sp.setQualifier(q);
return sp;
})
.collect(Collectors.toList()));
entity
.setAffiliation(
author
.getAffiliationList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return entity;
}
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
final GeoLocation entity = new GeoLocation();
entity.setPoint(geoLocation.getPoint());
entity.setBox(geoLocation.getBox());
entity.setPlace(geoLocation.getPlace());
return entity;
}
}
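
A short sketch (not repository code) of how the converter above is driven, mirroring TransformActions below: parse the protobuf-encoded payload and hand it to ProtoConverter.convert.

package eu.dnetlib.dhp.actionmanager.migration;

import com.google.protobuf.InvalidProtocolBufferException;

import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.schema.oaf.Oaf;

public class ProtoConverterSketch {

	// payload: a protobuf-encoded OafProtos.Oaf, e.g. the target value of a legacy AtomicAction
	public static Oaf toOaf(byte[] payload) throws InvalidProtocolBufferException {
		final OafProtos.Oaf protoOaf = OafProtos.Oaf.parseFrom(payload);
		return ProtoConverter.convert(protoOaf);
	}
}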

View File

@ -1,172 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class TransformActions implements Serializable {
private static final Logger log = LoggerFactory.getLogger(TransformActions.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: {}", inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark));
}
private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark)
throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info("found target directory '{}", targetDirectory);
fs.delete(targetDirectory, true);
log.info("deleted target directory '{}", targetDirectory);
}
log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory);
sc
.sequenceFile(sourcePath, Text.class, Text.class)
.map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString()))
.map(TransformActions::doTransform)
.filter(Objects::nonNull)
.mapToPair(
a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a)))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
targetDirectory.toString(),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
}
}
private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException {
// dedup similarity relations had empty target value, don't migrate them
if (aa.getTargetValue().length == 0) {
return null;
}
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid = proto_oaf
.getEntity()
.getResult()
.getMetadata()
.getResulttype()
.getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException(
"invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
}
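
To make the output of doTransform above concrete, here is a hedged sketch of wrapping a converted entity into an AtomicAction and building the key/value pair that the job writes to the SequenceFile; the class and method names of the sketch itself are illustrative only.

package eu.dnetlib.dhp.actionmanager.migration;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;

import scala.Tuple2;

public class TransformSketch {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	// key: the payload class rendered via toString(), as in the job above; value: the action as JSON
	public static Tuple2<String, String> toRecord(Relation rel) throws JsonProcessingException {
		final AtomicAction<Relation> action = new AtomicAction<>(Relation.class, rel);
		return new Tuple2<>(action.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(action));
	}
}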

View File

@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
import java.io.IOException; import java.io.IOException;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.function.BiFunction; import java.util.function.BiFunction;
import java.util.function.Function; import java.util.function.Function;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -68,6 +68,12 @@ public class PromoteActionPayloadForGraphTableJob {
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("strategy: {}", strategy); logger.info("strategy: {}", strategy);
Boolean shouldGroupById = Optional
.ofNullable(parser.get("shouldGroupById"))
.map(Boolean::valueOf)
.orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById);
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName); Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName); Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
@ -89,7 +95,8 @@ public class PromoteActionPayloadForGraphTableJob {
outputGraphTablePath, outputGraphTablePath,
strategy, strategy,
rowClazz, rowClazz,
actionPayloadClazz); actionPayloadClazz,
shouldGroupById);
}); });
} }
@ -115,12 +122,12 @@ public class PromoteActionPayloadForGraphTableJob {
String outputGraphTablePath, String outputGraphTablePath,
MergeAndGet.Strategy strategy, MergeAndGet.Strategy strategy,
Class<G> rowClazz, Class<G> rowClazz,
Class<A> actionPayloadClazz) { Class<A> actionPayloadClazz, Boolean shouldGroupById) {
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
Dataset<G> result = promoteActionPayloadForGraphTable( Dataset<G> result = promoteActionPayloadForGraphTable(
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz)); .map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
saveGraphTable(result, outputGraphTablePath); saveGraphTable(result, outputGraphTablePath);
@ -153,9 +160,9 @@ public class PromoteActionPayloadForGraphTableJob {
private static String extractPayload(Row value) { private static String extractPayload(Row value) {
try { try {
return value.<String> getAs("payload"); return value.getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) { } catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value.toString()); logger.error("cannot extract payload from action: {}", value);
throw e; throw e;
} }
} }
@ -174,7 +181,8 @@ public class PromoteActionPayloadForGraphTableJob {
Dataset<A> actionPayloadDS, Dataset<A> actionPayloadDS,
MergeAndGet.Strategy strategy, MergeAndGet.Strategy strategy,
Class<G> rowClazz, Class<G> rowClazz,
Class<A> actionPayloadClazz) { Class<A> actionPayloadClazz,
Boolean shouldGroupById) {
logger logger
.info( .info(
"Promoting action payload for graph table: payload={}, table={}", "Promoting action payload for graph table: payload={}, table={}",
@ -186,7 +194,7 @@ public class PromoteActionPayloadForGraphTableJob {
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<G> zeroFn = zeroFn(rowClazz); SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions
.joinGraphTableWithActionPayloadAndMerge( .joinGraphTableWithActionPayloadAndMerge(
@ -198,9 +206,13 @@ public class PromoteActionPayloadForGraphTableJob {
rowClazz, rowClazz,
actionPayloadClazz); actionPayloadClazz);
return PromoteActionPayloadFunctions if (shouldGroupById) {
.groupGraphTableByIdAndMerge( return PromoteActionPayloadFunctions
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); .groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
} else {
return joinedAndMerged;
}
} }
private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) { private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) {
@ -226,12 +238,13 @@ public class PromoteActionPayloadForGraphTableJob {
} }
} }
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSource() { private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
return t -> { return t -> {
if (isSubClass(t, Relation.class)) { if (isSubClass(t, Relation.class)) {
return Objects.nonNull(((Relation) t).getSource()); final Relation rel = (Relation) t;
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
} }
return Objects.nonNull(((OafEntity) t).getId()); return StringUtils.isNotBlank(((OafEntity) t).getId());
}; };
} }
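The new shouldGroupById flag makes the final group-by-id step optional: when enabled, all rows sharing an identifier are merged into a single record; when disabled, the joined dataset is returned as is. A compact sketch of the idea (a hypothetical helper, not the project's actual code):

```
import org.apache.spark.sql.{Dataset, Encoder, Encoders}

// Illustrative only: reduce duplicate rows per id when grouping is requested,
// otherwise skip the extra shuffle and return the joined dataset unchanged.
def promote[T](joined: Dataset[T], id: T => String, merge: (T, T) => T, shouldGroupById: Boolean)
              (implicit enc: Encoder[T]): Dataset[T] =
  if (shouldGroupById)
    joined
      .groupByKey(id)(Encoders.STRING) // rows with the same entity id end up together
      .reduceGroups(merge)             // apply the configured merge-and-get strategy
      .map(_._2)                       // keep only the merged record
  else
    joined
```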


@ -112,6 +112,7 @@ public class PromoteActionPayloadFunctions {
Class<G> rowClazz) { Class<G> rowClazz) {
TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn();
return rowDS return rowDS
.filter((FilterFunction<G>) o -> isNotZeroFn.get().apply(o))
.groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING()) .groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING())
.agg(aggregator) .agg(aggregator)
.map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz)); .map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz));
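The grouping shown above is backed by a typed Spark Aggregator (TableAggregator). A minimal, hypothetical sketch of that pattern, with the zero element and merge function supplied by the caller and a single encoder reused for buffer and output:

```
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator

// Hypothetical stand-in for TableAggregator: fold every row of a group into one
// value, starting from an "empty" element provided by the caller.
class MergeAggregator[T](empty: T, mergeFn: (T, T) => T)(implicit enc: Encoder[T])
    extends Aggregator[T, T, T] {
  override def zero: T = empty
  override def reduce(acc: T, row: T): T = mergeFn(acc, row)
  override def merge(left: T, right: T): T = mergeFn(left, right)
  override def finish(merged: T): T = merged
  override def bufferEncoder: Encoder[T] = enc
  override def outputEncoder: Encoder[T] = enc
}

// toColumn turns the aggregator into a TypedColumn usable with groupByKey(...).agg(...)
def mergeColumn[T](empty: T, mergeFn: (T, T) => T)(implicit enc: Encoder[T]): TypedColumn[T, T] =
  new MergeAggregator(empty, mergeFn).toColumn
```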


@ -1,56 +0,0 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "sn",
"paramLongName": "sourceNameNode",
"paramDescription": "nameNode of the source cluster",
"paramRequired": true
},
{
"paramName": "tn",
"paramLongName": "targetNameNode",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirectory",
"paramDescription": "working directory",
"paramRequired": true
},
{
"paramName": "nm",
"paramLongName": "distcp_num_maps",
"paramDescription": "maximum number of map tasks used in the distcp process",
"paramRequired": true
},
{
"paramName": "mm",
"paramLongName": "distcp_memory_mb",
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
"paramRequired": true
},
{
"paramName": "tt",
"paramLongName": "distcp_task_timeout",
"paramDescription": "timeout for distcp copying actions from remote cluster",
"paramRequired": true
},
{
"paramName": "tr",
"paramLongName": "transform_only",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
}
]


@ -1,20 +0,0 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "inputPaths",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
}
]


@ -40,5 +40,11 @@
"paramLongName": "mergeAndGetStrategy", "paramLongName": "mergeAndGetStrategy",
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET", "paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
"paramRequired": true "paramRequired": true
},
{
"paramName": "sgid",
"paramLongName": "shouldGroupById",
"paramDescription": "indicates whether the promotion operation should group objects in the graph by id or not",
"paramRequired": false
} }
] ]


@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/> <ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>


@ -56,6 +56,11 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<value>false</value>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>


@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/applicationHistory</value>
</property>
</configuration>


@ -1,138 +0,0 @@
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
<parameters>
<property>
<name>sourceNN</name>
<description>the source name node</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>workingDirectory</name>
<description>working directory</description>
</property>
<property>
<name>distcp_memory_mb</name>
<value>6144</value>
<description>memory for distcp copying actionsets from remote cluster</description>
</property>
<property>
<name>distcp_task_timeout</name>
<value>60000000</value>
<description>timeout for distcp copying actions from remote cluster</description>
</property>
<property>
<name>distcp_num_maps</name>
<value>1</value>
<description>mmaximum number of map tasks used in the distcp process</description>
</property>
<property>
<name>transform_only</name>
<description>activate tranform-only mode. Only apply transformation step</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="migrate_actionsets"/>
<action name="migrate_actionsets">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.migration.MigrateActionSet</main-class>
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--sourceNameNode</arg><arg>${sourceNN}</arg>
<arg>--targetNameNode</arg><arg>${nameNode}</arg>
<arg>--workingDirectory</arg><arg>${workingDirectory}</arg>
<arg>--distcp_num_maps</arg><arg>${distcp_num_maps}</arg>
<arg>--distcp_memory_mb</arg><arg>${distcp_memory_mb}</arg>
<arg>--distcp_task_timeout</arg><arg>${distcp_task_timeout}</arg>
<arg>--transform_only</arg><arg>${transform_only}</arg>
<capture-output/>
</java>
<ok to="transform_actions" />
<error to="fail" />
</action>
<action name="transform_actions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>transform_actions</name>
<class>eu.dnetlib.dhp.actionmanager.migration.TransformActions</class>
<jar>dhp-actionmanager-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
</spark>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end" />
</workflow-app>


@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/> <ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>


@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/> <ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>


@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name> <name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description> <description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property> </property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg> <arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/> <ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
<error to="Kill"/> <error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg> <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg> <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg> <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>


@ -101,7 +101,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath", "-outputGraphTablePath",
"", "",
"-mergeAndGetStrategy", "-mergeAndGetStrategy",
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(),
"--shouldGroupById",
"true"
})); }));
// then // then
@ -141,7 +143,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath", "-outputGraphTablePath",
outputGraphTableDir.toString(), outputGraphTableDir.toString(),
"-mergeAndGetStrategy", "-mergeAndGetStrategy",
strategy.name() strategy.name(),
"--shouldGroupById",
"true"
}); });
// then // then


@ -1,29 +1,27 @@
Description of the Module Description of the Module
-------------------------- --------------------------
This module defines a **collector worker application** that runs on Hadoop. This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data, since the MdSM keeps track of the logical-physical mapping
of each MDStore.
It is responsible for harvesting metadata using different plugins. ## Metadata collection
The collector worker uses a message queue to inform the progress The **metadata collection workflow** is responsible for harvesting metadata records from different protocols and responding to
of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore, different formats and to store them as on HDFS so that they can be further processed.
It gives, at the end of the job, some information about the status
of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
To work the collection worker need some parameter like: ### Collector Plugins
* **hdfsPath**: the path where storing the sequential file Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:
* **apidescriptor**: the JSON encoding of the API Descriptor
* **namenode**: the Name Node URI
* **userHDFS**: the user wich create the hdfs seq file
* **rabbitUser**: the user to connect with RabbitMq for messaging
* **rabbitPassWord**: the password to connect with RabbitMq for messaging
* **rabbitHost**: the host of the RabbitMq server
* **rabbitOngoingQueue**: the name of the ongoing queue
* **rabbitReportQueue**: the name of the report queue
* **workflowId**: the identifier of the dnet Workflow
##Plugins ```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```
* OAI Plugin
The list of the supported plugins:
* OAI Plugin: collects from OAI-PMH compatible endpoints
* MDStore plugin: collects from a given D-Net MetadataStore (identified by MongoDB URI, dbName, MDStoreID)
* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
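A hedged sketch of what a collector plugin can look like follows; the actual CollectorPlugin interface is defined elsewhere in the project and its exact signature may differ, so the trait below is a simplified stand-in and the line-per-record endpoint is an assumption made only for illustration:

```
import scala.io.Source

// Simplified stand-in for eu.dnetlib.dhp.collection.plugin.CollectorPlugin:
// a plugin turns an endpoint description into a stream of raw metadata records.
trait SimpleCollectorPlugin {
  def collect(baseUrl: String): Iterator[String]
}

// Hypothetical plugin reading one record per line from an HTTP endpoint.
class LineDumpCollectorPlugin extends SimpleCollectorPlugin {
  override def collect(baseUrl: String): Iterator[String] =
    Source.fromURL(baseUrl, "UTF-8").getLines()
}
```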
# Transformation Plugins
TODO
## Usage
TODO


@ -7,10 +7,44 @@
<version>1.2.4-SNAPSHOT</version> <version>1.2.4-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-aggregation</artifactId> <artifactId>dhp-aggregation</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>
@ -24,19 +58,7 @@
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId> <artifactId>dhp-common</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency> <dependency>
<groupId>net.sf.saxon</groupId> <groupId>net.sf.saxon</groupId>
@ -57,6 +79,11 @@
<artifactId>jaxen</artifactId> <artifactId>jaxen</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv --> <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
@ -77,8 +104,11 @@
<artifactId>commons-compress</artifactId> <artifactId>commons-compress</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
</dependencies> </dependencies>
</project> </project>


@ -75,7 +75,6 @@ public class CollectAndSave implements Serializable {
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)) .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)) .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
} }
private static void removeOutputDir(SparkSession spark, String path) { private static void removeOutputDir(SparkSession spark, String path) {


@ -36,7 +36,7 @@ import scala.Tuple2;
*/ */
public class SparkAtomicActionScoreJob implements Serializable { public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi"; private static final String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();


@ -0,0 +1,86 @@
package eu.dnetlib.dhp.actionmanager.datacite
import org.apache.commons.io.IOUtils
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import java.io.IOException
abstract class AbstractRestClient extends Iterator[String]{
var buffer: List[String] = List()
var current_index:Int = 0
var scroll_value: Option[String] = None
var complete:Boolean = false
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
def doHTTPGETRequest(url:String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
}
def doHTTPPOSTRequest(url:String, json:String): String = {
val httpPost = new HttpPost(url)
if (json != null) {
val entity = new StringEntity(json)
httpPost.setEntity(entity)
httpPost.setHeader("Accept", "application/json")
httpPost.setHeader("Content-type", "application/json")
}
doHTTPRequest(httpPost)
}
def hasNext: Boolean = {
buffer.nonEmpty && current_index < buffer.size
}
override def next(): String = {
val next_item:String = buffer(current_index)
current_index = current_index + 1
if (current_index == buffer.size)
getBufferData()
next_item
}
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
val client = HttpClients.createDefault
var tries = 4
try {
while (tries > 0) {
println(s"requesting ${r.getURI}")
val response = client.execute(r)
println(s"got response with status ${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
return IOUtils.toString(response.getEntity.getContent)
}
""
} catch {
case e: Throwable =>
throw new RuntimeException("Error on executing request ", e)
} finally try client.close()
catch {
case e: IOException =>
throw new RuntimeException("Unable to close client ", e)
}
}
getBufferData()
}
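A concrete client only has to provide extractInfo and getBufferData on top of the buffering and retry logic above. A minimal hedged sketch for a single-page endpoint (the endpoint and the line-based parsing are assumptions made for illustration):

```
import eu.dnetlib.dhp.actionmanager.datacite.AbstractRestClient

// Minimal concrete client: fetch one page from a fixed endpoint and expose its
// lines through the inherited Iterator[String] interface.
class SinglePageClient(endpoint: String) extends AbstractRestClient {

  override def extractInfo(input: String): Unit = {
    buffer = input.split("\n").toList // naive parsing: one record per line
    complete = true                   // no cursor to follow: a single page only
    current_index = 0
  }

  override protected def getBufferData(): Unit =
    if (!complete) extractInfo(doHTTPGETRequest(endpoint))
}
```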

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.actionmanager.datacite
import org.json4s.{DefaultFormats, JValue}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
override def extractInfo(input: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s)))
val next_url = (json \ "links" \ "next").extractOrElse[String](null)
scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None
if (scroll_value.isEmpty)
complete = true
current_index = 0
}
def get_url():String ={
val to = if (until> 0) s"$until" else "*"
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
}
override def getBufferData(): Unit = {
if (!complete) {
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
extractInfo(response)
}
}
}
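Since the importer is just an Iterator[String] over the paged API responses, it can be consumed directly. A hedged usage sketch (the timestamp and page size are illustrative, and the constructor already issues a live HTTP request to fill the first buffer):

```
import eu.dnetlib.dhp.actionmanager.datacite.DataciteAPIImporter

object DataciteAPIImporterExample {
  def main(args: Array[String]): Unit = {
    // stream records updated since the epoch, 100 records per page
    val importer = new DataciteAPIImporter(timestamp = 0L, blocks = 100L)
    importer.take(3).foreach(record => println(record.take(120))) // peek at the first few records
  }
}
```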


@ -0,0 +1,607 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.regex.Pattern
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
}
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
}
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
null
}
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
r.setInstance(List(i).asJava)
return r
case "publication" =>
val r = new Publication
r.setInstance(List(i).asJava)
return r
case "software" =>
val r = new Software
r.setInstance(List(i).asJava)
return r
case "other" =>
val r = new OtherResearchProduct
r.setInstance(List(i).asJava)
return r
}
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
l.exists(p => p.equalsIgnoreCase("available"))
}
/**
* As described in ticket #6377,
* when the result comes from Figshare we need to remove the subjects
* and set the access rights to OPEN.
*
* @param r
*/
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
r.setSubject(l.asJava)
}
}
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
}
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
if (match_pattern.isDefined) {
val m = match_pattern.get._1
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
}
else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
if (doi.isEmpty)
return List()
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(ts))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
}
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
result.setId(IdentifierFactory.createIdentifier(result))
if (result.getId == null)
return List()
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}")
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut)
rel
})(collection breakOut)
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
}
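The object exposes a few helpers that can be exercised in isolation. A hedged sketch with made-up sample values; the expected results noted in the comments follow from the pattern lists above and are not taken from project tests:

```
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation

object DataciteHelpersExample {
  def main(args: Array[String]): Unit = {
    // extract_date normalises free-text dates to ISO-8601 when one of the Date_regex patterns matches
    println(DataciteToOAFTransformation.extract_date("first published 12/03/2019")) // MM/dd is tried before dd/MM, so likely Some(2019-12-03)
    println(DataciteToOAFTransformation.extract_date("copyright 2015"))             // a year-only match is mapped to January 1st of that year
    // generateDSId turns "<prefix>::<name>" into the "10|<prefix>::md5(<name>)" datasource id form
    println(DataciteToOAFTransformation.generateDSId("openaire____::datacite"))
  }
}
```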


@ -0,0 +1,41 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object ExportActionSetJobNode {
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(ExportActionSetJobNode.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o =>DataciteToOAFTransformation.toActionSet(o))
.filter(o => o!= null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object FilterCrossrefEntitiesSpark {
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
}
}
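The filter above relies on runtime type checks over a kryo-encoded Dataset. A self-contained, local-mode sketch of the same pattern, using hypothetical Entity/ResultEntity classes in place of Oaf/Result:

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

// hypothetical stand-ins for the Oaf/Result bean hierarchy used above
class Entity(val id: String) extends Serializable
class ResultEntity(id: String, val title: String) extends Entity(id)

object FilterByRuntimeTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("filter-sketch").master("local[*]").getOrCreate()

    // kryo encoders, as in the job above, because the entities are not case classes
    implicit val entityEnc: Encoder[Entity] = Encoders.kryo[Entity]
    implicit val resultEnc: Encoder[ResultEntity] = Encoders.kryo[ResultEntity]

    val mixed: Dataset[Entity] = spark.createDataset(
      Seq(new Entity("e1"), new ResultEntity("r1", "a title"), new ResultEntity("r2", "another title")))

    // keep only the subtype of interest, exactly like the isInstanceOf/asInstanceOf pair above
    val results: Dataset[ResultEntity] =
      mixed.filter(e => e.isInstanceOf[ResultEntity]).map(e => e.asInstanceOf[ResultEntity])

    println(s"results kept: ${results.count()}") // expected: 2
    spark.stop()
  }
}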

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -0,0 +1,186 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions.max
import org.slf4j.{Logger, LoggerFactory}
import java.time.format.DateTimeFormatter._
import java.time.{LocalDate, LocalDateTime, ZoneOffset}
import scala.io.Source
object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase
val isActive = (json \ "attributes" \ "isActive").extract[Boolean]
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val hdfsuri = parser.get("namenode")
log.info(s"namenode is $hdfsuri")
val targetPath = parser.get("targetPath")
log.info(s"targetPath is $targetPath")
val dataciteDump = parser.get("dataciteDumpPath")
log.info(s"dataciteDump is $dataciteDump")
val hdfsTargetPath = new Path(targetPath)
log.info(s"hdfsTargetPath is $hdfsTargetPath")
val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport")
val spark: SparkSession = SparkSession.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
// ====== Init HDFS File System Object
val conf = new Configuration
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri)
// Because of Maven
conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("ERROR")
import spark.implicits._
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
override def zero: DataciteType = null
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
if (b == null)
return a
if (a == null)
return b
if (a.timestamp > b.timestamp) {
return a
}
b
}
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
reduce(a, b)
}
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
override def finish(reduction: DataciteType): DataciteType = reduction
}
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
val ts = dump.select(max("timestamp")).first().getLong(0)
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]
dump
.union(ds)
.groupByKey(_.doi)
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump"))
}
}
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
var from:Long = timestamp * 1000
val delta:Long = 50000000L
var client: DataciteAPIImporter = null
val now :Long =System.currentTimeMillis()
var i = 0
try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
try {
var start: Long = System.currentTimeMillis
while (from < now) {
client = new DataciteAPIImporter(from, bs, from + delta)
var end: Long = 0
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
key.set({
i += 1;
i - 1
})
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
val time = (end - start) / 1000.0F
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
}
println(s"updating from value: $from -> ${from+delta}")
from = from + delta
}
} catch {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
}
catch {
case e: Throwable =>
log.error("Error", e)
}
i
}
}
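The incremental update above keeps, for each DOI, the record with the highest timestamp by means of a typed Aggregator. A reduced, local-mode sketch of that dedup step, with a hypothetical Rec case class standing in for DataciteType:

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

// hypothetical simplified record, standing in for DataciteType
case class Rec(doi: String, timestamp: Long, json: String)

object LatestByDoiSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("latest-by-doi").master("local[*]").getOrCreate()
    import spark.implicits._

    // same shape as the dataciteAggregator above: null as zero, newest timestamp wins
    val latest = new Aggregator[Rec, Rec, Rec] with Serializable {
      override def zero: Rec = null
      override def reduce(a: Rec, b: Rec): Rec =
        if (a == null) b else if (b == null) a else if (a.timestamp >= b.timestamp) a else b
      override def merge(a: Rec, b: Rec): Rec = reduce(a, b)
      override def finish(r: Rec): Rec = r
      override def bufferEncoder: Encoder[Rec] = Encoders.product[Rec]
      override def outputEncoder: Encoder[Rec] = Encoders.product[Rec]
    }

    val ds = Seq(
      Rec("10.1234/a", 1L, "stale"),
      Rec("10.1234/a", 2L, "fresh"),
      Rec("10.1234/b", 5L, "only")).toDS()

    // one record per DOI survives: the freshest one
    ds.groupByKey(_.doi).agg(latest.toColumn).map(_._2).show(false)
    spark.stop()
  }
}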

View File

@ -143,24 +143,8 @@ public class PrepareProgramme {
JavaRDD<CSVProgramme> h2020Programmes = programme JavaRDD<CSVProgramme> h2020Programmes = programme
.toJavaRDD() .toJavaRDD()
.filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020"))
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme)) .mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
.reduceByKey((a, b) -> { .reduceByKey(PrepareProgramme::groupProgrammeByCode)
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}
return a;
})
.map(p -> { .map(p -> {
CSVProgramme csvProgramme = p._2(); CSVProgramme csvProgramme = p._2();
String programmeTitle = csvProgramme.getTitle().trim(); String programmeTitle = csvProgramme.getTitle().trim();
@ -177,20 +161,31 @@ public class PrepareProgramme {
return csvProgramme; return csvProgramme;
}); });
// prepareClassification(h2020Programmes); final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1); JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
rdd rdd
.map(csvProgramme -> { .map(OBJECT_MAPPER::writeValueAsString)
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
return tmp;
})
.saveAsTextFile(outputPath); .saveAsTextFile(outputPath);
} }
private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) {
if (!a.getLanguage().equals("en")) {
if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
}
if (StringUtils.isEmpty(a.getShortTitle())) {
if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
}
return a;
}
private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) { private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes Object[] codedescription = h2020Programmes
.map( .map(
@ -241,15 +236,15 @@ public class PrepareProgramme {
if (!ent.contains("Euratom")) { if (!ent.contains("Euratom")) {
String parent; String parent;
String tmp_key = tmp[0] + "."; String tmpKey = tmp[0] + ".";
for (int i = 1; i < tmp.length - 1; i++) { for (int i = 1; i < tmp.length - 1; i++) {
tmp_key += tmp[i] + "."; tmpKey += tmp[i] + ".";
parent = map.get(tmp_key)._1().toLowerCase().trim(); parent = map.get(tmpKey)._1().toLowerCase().trim();
if (parent.contains("|")) { if (parent.contains("|")) {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
} }
if (current.trim().length() > parent.length() if (current.trim().length() > parent.length()
&& current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) { && current.toLowerCase().trim().startsWith(parent)) {
current = current.substring(parent.length() + 1); current = current.substring(parent.length() + 1);
if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') { if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') {
current = current.trim().substring(1).trim(); current = current.trim().substring(1).trim();

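The refactoring above extracts the merge into groupProgrammeByCode: prefer the English title when one of the two records carries it, and fill in a missing short title from the other record. A local-mode Scala sketch of the same preference rules over a hypothetical Programme case class:

import org.apache.spark.sql.SparkSession

// hypothetical stand-in for CSVProgramme, limited to the fields the merge touches
case class Programme(code: String, title: String, shortTitle: String, language: String)

object GroupProgrammeByCodeSketch {

  // same preference rules as groupProgrammeByCode above
  def merge(a: Programme, b: Programme): Programme = {
    val (title, language) =
      if (!a.language.equalsIgnoreCase("en") && b.language.equalsIgnoreCase("en")) (b.title, b.language)
      else (a.title, a.language)
    val shortTitle = if (a.shortTitle.isEmpty && b.shortTitle.nonEmpty) b.shortTitle else a.shortTitle
    a.copy(title = title, shortTitle = shortTitle, language = language)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("programme-merge-sketch").master("local[*]").getOrCreate()

    val programmes = Seq(
      Programme("H2020-EU.1.", "Science d'excellence", "", "fr"),
      Programme("H2020-EU.1.", "Excellent science", "Excellent science", "en"))

    spark.sparkContext
      .parallelize(programmes)
      .map(p => (p.code, p))
      .reduceByKey(merge)
      .values
      .collect()
      .foreach(println) // expected: the English title and the filled-in short title win
    spark.stop()
  }
}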
View File

@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
@ -32,7 +31,6 @@ public class PrepareProjects {
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -93,7 +91,7 @@ public class PrepareProjects {
} }
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() { private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> { return value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2()); Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>(); List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) { if (csvProject.isPresent()) {

View File

@ -120,7 +120,6 @@ public class SparkAtomicActionJob {
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> { .map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {
CSVProject csvProject = c._1(); CSVProject csvProject = c._1();
Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2());
return Optional return Optional
.ofNullable(c._2()) .ofNullable(c._2())
@ -135,9 +134,9 @@ public class SparkAtomicActionJob {
H2020Programme pm = new H2020Programme(); H2020Programme pm = new H2020Programme();
H2020Classification h2020classification = new H2020Classification(); H2020Classification h2020classification = new H2020Classification();
pm.setCode(csvProject.getProgramme()); pm.setCode(csvProject.getProgramme());
h2020classification.setClassification(ocsvProgramme.get().getClassification()); h2020classification.setClassification(csvProgramme.getClassification());
h2020classification.setH2020Programme(pm); h2020classification.setH2020Programme(pm);
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short()); setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
// setProgramme(h2020classification, ocsvProgramme.get().getClassification()); // setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification)); pp.setH2020classification(Arrays.asList(h2020classification));
@ -145,10 +144,11 @@ public class SparkAtomicActionJob {
}) })
.orElse(null); .orElse(null);
}, Encoders.bean(Project.class)); }, Encoders.bean(Project.class))
.filter(Objects::nonNull);
aaproject aaproject
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code"))) .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> { .map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2()); Optional<EXCELTopic> op = Optional.ofNullable(p._2());
Project rp = p._1(); Project rp = p._1();

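The change above turns the topic join into a left join, so projects without a matching topic survive with a null right side that the job wraps in Optional. A small sketch of that behaviour with hypothetical Proj/Topic case classes:

import org.apache.spark.sql.SparkSession

// hypothetical stand-ins for the project/topic beans joined above
case class Proj(id: String, topicCode: String)
case class Topic(code: String, title: String)

object LeftJoinWithSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("left-joinwith-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val projects = Seq(Proj("p1", "T1"), Proj("p2", "T-missing")).toDS()
    val topics   = Seq(Topic("T1", "A topic title")).toDS()

    // a "left" joinWith keeps every project; the right side is null when no topic matches,
    // which is why the job above guards it with Optional.ofNullable before reading its fields
    val enriched = projects
      .joinWith(topics, projects("topicCode") === topics("code"), "left")
      .map { case (p, t) => (p.id, Option(t).map(_.title).getOrElse("n/a")) }

    enriched.show(false) // p1 gets the topic title, p2 falls back to "n/a"
    spark.stop()
  }
}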
View File

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
import java.util.LinkedList;
public class CollectorPluginErrorLogList extends LinkedList<String> {
private static final long serialVersionUID = -6925786561303289704L;
@Override
public String toString() {
String log = new String();
int index = 0;
for (String errorMessage : this) {
log += String.format("Retry #%s: %s / ", index++, errorMessage);
}
return log;
}
}

View File

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
public class CollectorServiceException extends Exception {
private static final long serialVersionUID = 7523999812098059764L;
public CollectorServiceException(String string) {
super(string);
}
public CollectorServiceException(String string, Throwable exception) {
super(string, exception);
}
public CollectorServiceException(Throwable exception) {
super(exception);
}
}

View File

@ -1,240 +0,0 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.security.GeneralSecurityException;
import java.security.cert.X509Certificate;
import java.util.List;
import java.util.Map;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* @author jochen, michele, andrea
*/
public class HttpConnector {
private static final Log log = LogFactory.getLog(HttpConnector.class);
private int maxNumberOfRetry = 6;
private int defaultDelay = 120; // seconds
private int readTimeOut = 120; // seconds
private String responseType = null;
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector() {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl) throws CollectorServiceException {
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
}
/**
* Given the URL returns the content as a stream via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource as InputStream
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
}
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
try {
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
try {
return IOUtils.toString(s);
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
} finally {
IOUtils.closeQuietly(s);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
if (retryNumber > maxNumberOfRetry) {
throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList);
}
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
try {
InputStream input = null;
try {
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(readTimeOut * 1000);
urlConn.addRequestProperty("User-Agent", userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
log.warn("waiting and repeating request after " + retryAfter + " sec.");
Thread.sleep(retryAfter * 1000);
errorList.add("503 Service Unavailable");
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.debug("The requested url has been moved to " + newUrl);
errorList
.add(
String
.format(
"%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(),
newUrl));
urlConn.disconnect();
return attemptDownload(newUrl, retryNumber + 1, errorList);
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
log
.error(
String
.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
Thread.sleep(defaultDelay * 1000);
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownload(requestUrl, retryNumber + 1, errorList);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer
.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
}
/**
* register for https scheme; this is a workaround and not intended for the use in trusted environments
*/
public void initTrustManager() {
final X509TrustManager tm = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
try {
final SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(null, new TrustManager[] {
tm
}, null);
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
} catch (GeneralSecurityException e) {
log.fatal(e);
throw new IllegalStateException(e);
}
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getDefaultDelay() {
return defaultDelay;
}
public void setDefaultDelay(final int defaultDelay) {
this.defaultDelay = defaultDelay;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(final int readTimeOut) {
this.readTimeOut = readTimeOut;
}
public String getResponseType() {
return responseType;
}
}
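The class removed above implemented retry with a fixed delay, honoured Retry-After on 503 responses and followed 301/302 redirects before giving up after maxNumberOfRetry attempts. A compact sketch of that retry policy; the URL, the delays and the helper name are illustrative:

import java.net.{HttpURLConnection, URL}

import scala.io.Source

object RetryingGetSketch {

  // minimal restatement of the policy above: retry on failure, honour Retry-After on 503,
  // follow Location on 301/302, give up once maxRetries is exceeded
  def get(url: String, maxRetries: Int = 6, delaySeconds: Int = 120, attempt: Int = 1): String = {
    require(attempt <= maxRetries, s"max number of retries exceeded for $url")
    val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
    conn.setInstanceFollowRedirects(false)
    conn.setReadTimeout(120 * 1000)
    conn.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)")
    try conn.getResponseCode match {
      case HttpURLConnection.HTTP_OK =>
        Source.fromInputStream(conn.getInputStream).mkString
      case HttpURLConnection.HTTP_UNAVAILABLE =>
        val retryAfter = Option(conn.getHeaderField("Retry-After")).map(_.toInt).getOrElse(delaySeconds)
        Thread.sleep(retryAfter * 1000L)
        get(url, maxRetries, delaySeconds, attempt + 1)
      case HttpURLConnection.HTTP_MOVED_PERM | HttpURLConnection.HTTP_MOVED_TEMP =>
        get(conn.getHeaderField("Location"), maxRetries, delaySeconds, attempt + 1)
      case _ =>
        Thread.sleep(delaySeconds * 1000L)
        get(url, maxRetries, delaySeconds, attempt + 1)
    } finally conn.disconnect()
  }

  def main(args: Array[String]): Unit =
    println(get("https://example.org/").take(80)) // illustrative URL
}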

View File

@ -7,14 +7,7 @@ import java.io.Serializable;
* The model for the programme csv file * The model for the programme csv file
*/ */
public class CSVProgramme implements Serializable { public class CSVProgramme implements Serializable {
private String parentProgramme;
private String frameworkProgramme;
private String startDate;
private String endDate;
private String objective;
private String subjects;
private String legalBasis;
private String call;
private String rcn; private String rcn;
private String code; private String code;
@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable {
this.language = language; this.language = language;
} }
public String getParentProgramme() { //
return parentProgramme;
}
public void setParentProgramme(String parentProgramme) {
this.parentProgramme = parentProgramme;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getSubjects() {
return subjects;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
}
public String getLegalBasis() {
return legalBasis;
}
public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}
public String getCall() {
return call;
}
public void setCall(String call) {
this.call = call;
}
} }

View File

@ -26,7 +26,6 @@ public class EXCELParser {
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
InvalidFormatException { InvalidFormatException {
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
OPCPackage pkg = OPCPackage.open(file); OPCPackage pkg = OPCPackage.open(file);
XSSFWorkbook wb = new XSSFWorkbook(pkg); XSSFWorkbook wb = new XSSFWorkbook(pkg);
@ -58,7 +57,6 @@ public class EXCELParser {
for (int i = 0; i < headers.size(); i++) { for (int i = 0; i < headers.size(); i++) {
Cell cell = row.getCell(i); Cell cell = row.getCell(i);
String value = dataFormatter.formatCellValue(cell);
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
} }

View File

@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.HttpConnector2;
/** /**
* Applies the parsing of a csv file and writes the Serialization of it in hdfs * Applies the parsing of a csv file and writes the Serialization of it in hdfs
@ -28,7 +28,7 @@ public class ReadCSV implements Closeable {
private final Configuration conf; private final Configuration conf;
private final BufferedWriter writer; private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private String csvFile; private final String csvFile;
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -74,7 +74,7 @@ public class ReadCSV implements Closeable {
throws Exception { throws Exception {
this.conf = new Configuration(); this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode); this.conf.set("fs.defaultFS", hdfsNameNode);
HttpConnector httpConnector = new HttpConnector(); HttpConnector2 httpConnector = new HttpConnector2();
FileSystem fileSystem = FileSystem.get(this.conf); FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath); Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null; FSDataOutputStream fsDataOutputStream = null;
@ -85,7 +85,6 @@ public class ReadCSV implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.csvFile = httpConnector.getInputSource(fileURL); this.csvFile = httpConnector.getInputSource(fileURL);
;
} }
protected void write(final Object p) { protected void write(final Object p) {

View File

@ -14,19 +14,18 @@ import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.HttpConnector2;
/** /**
* Applies the parsing of an excel file and writes the Serialization of it in hdfs * Applies the parsing of an excel file and writes the Serialization of it in hdfs
*/ */
public class ReadExcel implements Closeable { public class ReadExcel implements Closeable {
private static final Log log = LogFactory.getLog(ReadCSV.class); private static final Log log = LogFactory.getLog(ReadCSV.class);
private final Configuration conf; private final Configuration conf;
private final BufferedWriter writer; private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private InputStream excelFile; private final InputStream excelFile;
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -73,7 +72,7 @@ public class ReadExcel implements Closeable {
throws Exception { throws Exception {
this.conf = new Configuration(); this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode); this.conf.set("fs.defaultFS", hdfsNameNode);
HttpConnector httpConnector = new HttpConnector(); HttpConnector2 httpConnector = new HttpConnector2();
FileSystem fileSystem = FileSystem.get(this.conf); FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath); Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null; FSDataOutputStream fsDataOutputStream = null;
@ -84,7 +83,6 @@ public class ReadExcel implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.excelFile = httpConnector.getInputSourceAsStream(fileURL); this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
;
} }
protected void write(final Object p) { protected void write(final Object p) {

View File

@ -3,11 +3,11 @@ package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.aggregation.common;
import java.io.Serializable;
import org.apache.spark.util.LongAccumulator;
public class AggregationCounter implements Serializable {
private LongAccumulator totalItems;
private LongAccumulator errorItems;
private LongAccumulator processedItems;
public AggregationCounter() {
}
public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) {
this.totalItems = totalItems;
this.errorItems = errorItems;
this.processedItems = processedItems;
}
public LongAccumulator getTotalItems() {
return totalItems;
}
public void setTotalItems(LongAccumulator totalItems) {
this.totalItems = totalItems;
}
public LongAccumulator getErrorItems() {
return errorItems;
}
public void setErrorItems(LongAccumulator errorItems) {
this.errorItems = errorItems;
}
public LongAccumulator getProcessedItems() {
return processedItems;
}
public void setProcessedItems(LongAccumulator processedItems) {
this.processedItems = processedItems;
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.aggregation.common;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
private MessageSender messageSender;
public AggregatorReport() {
}
public AggregatorReport(MessageSender messageSender) throws IOException {
this.messageSender = messageSender;
}
public void ongoing(Long current, Long total) {
messageSender.sendMessage(current, total);
}
@Override
public void close() throws IOException {
if (Objects.nonNull(messageSender)) {
log.info("closing report: ");
this.forEach((k, v) -> log.info("{} - {}", k, v));
Map<String, String> m = new HashMap<>();
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
messageSender.sendReport(m);
}
}
}
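close() above ships the report values as a JSON payload keyed by the lowercase class name of the report. A small sketch of that serialization with Jackson, using a plain LinkedHashMap in place of the report and the MessageSender:

import java.util

import scala.collection.JavaConverters._

import com.fasterxml.jackson.databind.ObjectMapper

object ReportSerializationSketch {
  def main(args: Array[String]): Unit = {
    // stand-in for the LinkedHashMap<String, String> the report extends
    val report = new util.LinkedHashMap[String, String]()
    report.put("total", "1000")
    report.put("invalid", "3")

    // mirrors close(): values() become a JSON array, keyed by the report's simple name
    val mapper  = new ObjectMapper()
    val payload = Map("aggregatorreport" -> mapper.writeValueAsString(report.values())).asJava
    println(mapper.writeValueAsString(payload)) // {"aggregatorreport":"[\"1000\",\"3\"]"}
  }
}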

View File

@ -0,0 +1,10 @@
package eu.dnetlib.dhp.aggregation.common;
public interface ReporterCallback {
Long getCurrent();
Long getTotal();
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.aggregation.common;
import java.util.TimerTask;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public abstract class ReportingJob {
/**
* Frequency (seconds) for sending ongoing messages to report the collection task advancement
*/
public static final int ONGOING_REPORT_FREQUENCY = 5;
/**
* Initial delay (seconds) for sending ongoing messages to report the collection task advancement
*/
public static final int INITIAL_DELAY = 2;
private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
protected final AggregatorReport report;
public ReportingJob(AggregatorReport report) {
this.report = report;
}
protected void schedule(final ReporterCallback callback) {
executor.scheduleAtFixedRate(new TimerTask() {
@Override
public void run() {
report.ongoing(callback.getCurrent(), callback.getTotal());
}
}, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS);
}
protected void shutdown() {
executor.shutdown();
}
}
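ReportingJob above schedules an ongoing message after INITIAL_DELAY and then every ONGOING_REPORT_FREQUENCY seconds. A minimal sketch of that scheduling shape, with a simulated counter standing in for the real progress callback:

import java.util.concurrent.atomic.AtomicLong
import java.util.concurrent.{Executors, TimeUnit}

object OngoingReportSketch {
  def main(args: Array[String]): Unit = {
    val processed = new AtomicLong(0)
    val executor  = Executors.newSingleThreadScheduledExecutor()

    // first tick after 2 seconds, then one "ongoing" message every 5 seconds,
    // matching INITIAL_DELAY and ONGOING_REPORT_FREQUENCY above
    executor.scheduleAtFixedRate(
      new Runnable { override def run(): Unit = println(s"ongoing: ${processed.get()} records") },
      2, 5, TimeUnit.SECONDS)

    // simulate a collection task advancing while the reporter runs
    (1 to 100).foreach { _ => processed.incrementAndGet(); Thread.sleep(100) }

    executor.shutdown()
  }
}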

View File

@ -0,0 +1,136 @@
package eu.dnetlib.dhp.aggregation.mdstore;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class MDStoreActionNode {
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
enum MDAction {
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
}
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
MDStoreActionNode.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
argumentParser.parseArgument(args);
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
log.info("Current action is {}", action);
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
switch (action) {
case NEW_VERSION: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
break;
}
case COMMIT: {
final String hdfsuri = argumentParser.get("namenode");
if (StringUtils.isBlank(hdfsuri)) {
throw new IllegalArgumentException("missing or empty argument namenode");
}
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
try (
FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri));
FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) {
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
fs.create(hdfstoreSizepath);
DNetRestClient
.doGET(
String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
}
break;
}
case ROLLBACK: {
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
case READ_LOCK: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
break;
}
case READ_UNLOCK: {
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
default:
throw new IllegalArgumentException("invalid action");
}
}
}

View File

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.collection.worker; package eu.dnetlib.dhp.collection;
public class DnetCollectorException extends Exception { public class CollectorException extends Exception {
/** */ /** */
private static final long serialVersionUID = -290723075076039757L; private static final long serialVersionUID = -290723075076039757L;
public DnetCollectorException() { public CollectorException() {
super(); super();
} }
public DnetCollectorException( public CollectorException(
final String message, final String message,
final Throwable cause, final Throwable cause,
final boolean enableSuppression, final boolean enableSuppression,
@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception {
super(message, cause, enableSuppression, writableStackTrace); super(message, cause, enableSuppression, writableStackTrace);
} }
public DnetCollectorException(final String message, final Throwable cause) { public CollectorException(final String message, final Throwable cause) {
super(message, cause); super(message, cause);
} }
public DnetCollectorException(final String message) { public CollectorException(final String message) {
super(message); super(message);
} }
public DnetCollectorException(final Throwable cause) { public CollectorException(final Throwable cause) {
super(cause); super(cause);
} }
} }

View File

@ -0,0 +1,134 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME;
import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DeflateCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class CollectorWorker extends ReportingJob {
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
private final ApiDescriptor api;
private final FileSystem fileSystem;
private final MDStoreVersion mdStoreVersion;
private final HttpClientParams clientParams;
public CollectorWorker(
final ApiDescriptor api,
final FileSystem fileSystem,
final MDStoreVersion mdStoreVersion,
final HttpClientParams clientParams,
final AggregatorReport report) {
super(report);
this.api = api;
this.fileSystem = fileSystem;
this.mdStoreVersion = mdStoreVersion;
this.clientParams = clientParams;
}
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
log.info("outputPath path is {}", outputPath);
final CollectorPlugin plugin = getCollectorPlugin();
final AtomicInteger counter = new AtomicInteger(0);
scheduleReport(counter);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
fileSystem.getConf(),
SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
plugin
.collect(api, report)
.forEach(
content -> {
key.set(counter.getAndIncrement());
value.set(content);
try {
writer.append(key, value);
} catch (Throwable e) {
throw new RuntimeException(e);
}
});
} catch (Throwable e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} finally {
shutdown();
report.ongoing(counter.longValue(), counter.longValue());
}
}
private void scheduleReport(AtomicInteger counter) {
schedule(new ReporterCallback() {
@Override
public Long getCurrent() {
return counter.longValue();
}
@Override
public Long getTotal() {
return null;
}
});
}
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
case oai:
return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
.get();
switch (plugin) {
case mdstore_mongodb_dump:
return new MongoDbDumpCollectorPlugin(fileSystem);
case mdstore_mongodb:
return new MDStoreCollectorPlugin();
default:
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
}
default:
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
}
}
}
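collect() above streams every record returned by the plugin into a block-compressed IntWritable/Text SequenceFile, incrementing the key as it goes. A standalone sketch of that sink, with a hypothetical output path and an in-memory record stream in place of the plugin:

import java.util.concurrent.atomic.AtomicInteger

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.DeflateCodec
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}

object SequenceFileSinkSketch {
  def main(args: Array[String]): Unit = {
    val conf    = new Configuration() // the local filesystem is enough for the sketch
    val output  = new Path("/tmp/collector-sketch.seq") // hypothetical output path
    val counter = new AtomicInteger(0)

    // block-compressed IntWritable/Text sink, as created in collect() above
    val writer = SequenceFile.createWriter(
      conf,
      SequenceFile.Writer.file(output),
      SequenceFile.Writer.keyClass(classOf[IntWritable]),
      SequenceFile.Writer.valueClass(classOf[Text]),
      SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))

    val key   = new IntWritable()
    val value = new Text()
    try {
      // stand-in for the record stream produced by the collector plugin
      Seq("<record>1</record>", "<record>2</record>").foreach { content =>
        key.set(counter.getAndIncrement())
        value.set(content)
        writer.append(key, value)
      }
    } finally writer.close()

    println(s"wrote ${counter.get()} records to $output")
  }
}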

View File

@ -0,0 +1,135 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.util.Optional;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
/**
* CollectorWorkerApplication is the main class responsible for starting the metadata collection process and storing the
* outcomes into HDFS. The application runs on the hadoop cluster: invoked in the context of the metadata collection oozie
* workflow, it receives all the input parameters needed to instantiate the specific collection plugin and its
* specific configuration
*
* @author Sandro La Bruzzo, Claudio Atzori
*/
public class CollectorWorkerApplication {
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
private final FileSystem fileSystem;
public CollectorWorkerApplication(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
/**
* @param args
*/
public static void main(final String[] args)
throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
CollectorWorkerApplication.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json")));
argumentParser.parseArgument(args);
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
final String hdfsuri = argumentParser.get("namenode");
log.info("hdfsURI is {}", hdfsuri);
final String apiDescriptor = argumentParser.get("apidescriptor");
log.info("apiDescriptor is {}", apiDescriptor);
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL);
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
final String workflowId = argumentParser.get("workflowId");
log.info("workflowId is {}", workflowId);
final HttpClientParams clientParams = getClientParams(argumentParser);
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
new CollectorWorkerApplication(fileSystem)
.run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId);
}
protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
String dnetMessageManagerURL, String workflowId)
throws IOException, CollectorException, UnknownCollectorPluginException {
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
try (AggregatorReport report = new AggregatorReport(ms)) {
new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
}
}
private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
final HttpClientParams clientParams = new HttpClientParams();
clientParams
.setMaxNumberOfRetry(
Optional
.ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY))
.map(Integer::parseInt)
.orElse(HttpClientParams._maxNumberOfRetry));
log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry());
clientParams
.setRequestDelay(
Optional
.ofNullable(argumentParser.get(REQUEST_DELAY))
.map(Integer::parseInt)
.orElse(HttpClientParams._requestDelay));
log.info("requestDelay is {}", clientParams.getRequestDelay());
clientParams
.setRetryDelay(
Optional
.ofNullable(argumentParser.get(RETRY_DELAY))
.map(Integer::parseInt)
.orElse(HttpClientParams._retryDelay));
log.info("retryDelay is {}", clientParams.getRetryDelay());
clientParams
.setConnectTimeOut(
Optional
.ofNullable(argumentParser.get(CONNECT_TIMEOUT))
.map(Integer::parseInt)
.orElse(HttpClientParams._connectTimeOut));
log.info("connectTimeOut is {}", clientParams.getConnectTimeOut());
clientParams
.setReadTimeOut(
Optional
.ofNullable(argumentParser.get(READ_TIMEOUT))
.map(Integer::parseInt)
.orElse(HttpClientParams._readTimeOut));
log.info("readTimeOut is {}", clientParams.getReadTimeOut());
return clientParams;
}
}
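getClientParams above reads each HTTP client setting as an optional argument, falling back to the defaults defined in HttpClientParams. A tiny sketch of that optional-with-default pattern; the argument map and the default values are illustrative:

object OptionalParamSketch {

  // same "optional argument with a default" shape as getClientParams above
  def intParam(args: Map[String, String], name: String, default: Int): Int =
    args.get(name).filter(_.nonEmpty).map(_.toInt).getOrElse(default)

  def main(args: Array[String]): Unit = {
    // hypothetical parsed arguments: maxNumberOfRetry is provided, requestDelay falls back
    val parsed = Map("maxNumberOfRetry" -> "3")

    println(s"maxNumberOfRetry is ${intParam(parsed, "maxNumberOfRetry", 6)}") // 3
    println(s"requestDelay is ${intParam(parsed, "requestDelay", 0)}")         // 0 (the default)
  }
}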

View File

@ -1,28 +1,26 @@
package eu.dnetlib.dhp.collection; package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoder; import org.apache.spark.sql.*;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
@ -30,19 +28,172 @@ import org.dom4j.io.SAXReader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.message.Message; import eu.dnetlib.dhp.schema.mdstore.Provenance;
import eu.dnetlib.message.MessageManager; import scala.Tuple2;
import eu.dnetlib.message.MessageType;
public class GenerateNativeStoreSparkJob { public class GenerateNativeStoreSparkJob {
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateNativeStoreSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/generate_native_input_parameters.json")));
parser.parseArgument(args);
final String provenanceArgument = parser.get("provenance");
log.info("Provenance is {}", provenanceArgument);
final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class);
final String dateOfCollectionArgs = parser.get("dateOfCollection");
log.info("dateOfCollection is {}", dateOfCollectionArgs);
final Long dateOfCollection = new Long(dateOfCollectionArgs);
String mdStoreVersion = parser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
String readMdStoreVersionParam = parser.get("readMdStoreVersion");
log.info("readMdStoreVersion is {}", readMdStoreVersionParam);
final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null
: MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class);
final String xpath = parser.get("xpath");
log.info("xpath is {}", xpath);
final String encoding = parser.get("encoding");
log.info("encoding is {}", encoding);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> createNativeMDStore(
spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion));
}
private static void createNativeMDStore(SparkSession spark,
Provenance provenance,
Long dateOfCollection,
String xpath,
String encoding,
MDStoreVersion currentVersion,
MDStoreVersion readVersion) throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS);
final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS);
final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
final JavaRDD<MetadataRecord> nativeStore = sc
.sequenceFile(seqFilePath, IntWritable.class, Text.class)
.map(
item -> parseRecord(
item._2().toString(),
xpath,
encoding,
provenance,
dateOfCollection,
totalItems,
invalidRecords))
.filter(Objects::nonNull)
.distinct();
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH;
if (readVersion != null) { // INCREMENTAL MODE
log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath());
Dataset<MetadataRecord> currentMdStoreVersion = spark
.read()
.load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH)
.as(encoder);
TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn();
final Dataset<MetadataRecord> map = currentMdStoreVersion
.union(mdstore)
.groupByKey(
(MapFunction<MetadataRecord, String>) MetadataRecord::getId,
Encoders.STRING())
.agg(aggregator)
.map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder);
map.select("id").takeAsList(100).forEach(s -> log.info(s.toString()));
saveDataset(map, targetPath);
} else {
saveDataset(mdstore, targetPath);
}
final Long total = spark.read().load(targetPath).count();
log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName());
writeHdfsFile(
spark.sparkContext().hadoopConfiguration(), total.toString(),
currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
}
public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> {
@Override
public MetadataRecord zero() {
return null;
}
@Override
public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) {
return getLatestRecord(b, a);
}
@Override
public MetadataRecord merge(MetadataRecord b, MetadataRecord a) {
return getLatestRecord(b, a);
}
private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) {
if (b == null)
return a;
if (a == null)
return b;
return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b;
}
@Override
public MetadataRecord finish(MetadataRecord r) {
return r;
}
@Override
public Encoder<MetadataRecord> bufferEncoder() {
return Encoders.bean(MetadataRecord.class);
}
@Override
public Encoder<MetadataRecord> outputEncoder() {
return Encoders.bean(MetadataRecord.class);
}
}
public static MetadataRecord parseRecord( public static MetadataRecord parseRecord(
final String input, final String input,
final String xpath, final String xpath,
@ -64,112 +215,11 @@ public class GenerateNativeStoreSparkJob {
invalidRecords.add(1); invalidRecords.add(1);
return null; return null;
} }
return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection);
} catch (Throwable e) { } catch (Throwable e) {
if (invalidRecords != null) invalidRecords.add(1);
invalidRecords.add(1);
e.printStackTrace();
return null; return null;
} }
} }
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateNativeStoreSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
parser.parseArgument(args);
final ObjectMapper jsonMapper = new ObjectMapper();
final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final Map<String, String> ongoingMap = new HashMap<>();
final Map<String, String> reportMap = new HashMap<>();
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaPairRDD<IntWritable, Text> inputRDD = sc
.sequenceFile(parser.get("input"), IntWritable.class, Text.class);
final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
final MessageManager manager = new MessageManager(
parser.get("rabbitHost"),
parser.get("rabbitUser"),
parser.get("rabbitPassword"),
false,
false,
null);
final JavaRDD<MetadataRecord> mappeRDD = inputRDD
.map(
item -> parseRecord(
item._2().toString(),
parser.get("xpath"),
parser.get("encoding"),
provenance,
dateOfCollection,
totalItems,
invalidRecords))
.filter(Objects::nonNull)
.distinct();
ongoingMap.put("ongoing", "0");
if (!test) {
manager
.sendMessage(
new Message(
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
parser.get("rabbitOngoingQueue"),
true,
false);
}
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
mdStoreRecords.add(mdstore.count());
ongoingMap.put("ongoing", "" + totalItems.value());
if (!test) {
manager
.sendMessage(
new Message(
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
parser.get("rabbitOngoingQueue"),
true,
false);
}
mdstore.write().format("parquet").save(parser.get("output"));
reportMap.put("inputItem", "" + totalItems.value());
reportMap.put("invalidRecords", "" + invalidRecords.value());
reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
if (!test) {
manager
.sendMessage(
new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
parser.get("rabbitReportQueue"),
true,
false);
manager.close();
}
});
}
} }
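The incremental branch above merges the previous MDStore version with the newly parsed records by grouping on MetadataRecord::getId and keeping, per identifier, the record with the highest dateOfCollection (MDStoreAggregator.reduce/merge). Below is a minimal, self-contained sketch of that merge rule in plain Java; the Rec class is a hypothetical stand-in for MetadataRecord, exposing only the two accessors the aggregator uses.

import java.util.*;
import java.util.stream.*;

public class KeepLatestByIdSketch {

	// Hypothetical stand-in for MetadataRecord: only the two accessors used by MDStoreAggregator.
	static class Rec {
		private final String id;
		private final long dateOfCollection;

		Rec(final String id, final long dateOfCollection) {
			this.id = id;
			this.dateOfCollection = dateOfCollection;
		}

		String getId() {
			return id;
		}

		long getDateOfCollection() {
			return dateOfCollection;
		}
	}

	public static void main(String[] args) {
		final List<Rec> previousVersion = Arrays.asList(new Rec("a", 10L), new Rec("b", 10L));
		final List<Rec> newlyCollected = Arrays.asList(new Rec("b", 20L), new Rec("c", 20L));

		// Same rule as MDStoreAggregator.getLatestRecord: per id, keep the record with the higher dateOfCollection.
		final Map<String, Rec> merged = Stream
			.concat(previousVersion.stream(), newlyCollected.stream())
			.collect(Collectors.toMap(Rec::getId, r -> r,
				(current, candidate) -> candidate.getDateOfCollection() > current.getDateOfCollection() ? candidate : current));

		merged.values().forEach(r -> System.out.println(r.getId() + " -> " + r.getDateOfCollection()));
		// prints (in some order): a -> 10, b -> 20, c -> 20
	}
}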

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.collection;
/**
* Bundles the http connection parameters driving the client behaviour.
*/
public class HttpClientParams {
// Defaults
public static int _maxNumberOfRetry = 3;
public static int _requestDelay = 0; // milliseconds
public static int _retryDelay = 10; // seconds
public static int _connectTimeOut = 10; // seconds
public static int _readTimeOut = 30; // seconds
/**
* Maximum number of allowed retries before failing
*/
private int maxNumberOfRetry;
/**
* Delay between requests (Milliseconds)
*/
private int requestDelay;
/**
* Time to wait after a failure before retrying (Seconds)
*/
private int retryDelay;
/**
* Connect timeout (Seconds)
*/
private int connectTimeOut;
/**
* Read timeout (Seconds)
*/
private int readTimeOut;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
int readTimeOut) {
this.maxNumberOfRetry = maxNumberOfRetry;
this.requestDelay = requestDelay;
this.retryDelay = retryDelay;
this.connectTimeOut = connectTimeOut;
this.readTimeOut = readTimeOut;
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getRequestDelay() {
return requestDelay;
}
public void setRequestDelay(int requestDelay) {
this.requestDelay = requestDelay;
}
public int getRetryDelay() {
return retryDelay;
}
public void setRetryDelay(int retryDelay) {
this.retryDelay = retryDelay;
}
public void setConnectTimeOut(int connectTimeOut) {
this.connectTimeOut = connectTimeOut;
}
public int getConnectTimeOut() {
return connectTimeOut;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(int readTimeOut) {
this.readTimeOut = readTimeOut;
}
}
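A short usage sketch of the parameter bean above (values are arbitrary examples): the no-arg constructor applies the defaults, while the full constructor or the setters override individual values; note that requestDelay is expressed in milliseconds and the remaining delays and timeouts in seconds, as stated in the field comments.

// Usage sketch, assuming HttpClientParams (above) is on the classpath.
HttpClientParams defaults = new HttpClientParams(); // 3 retries, 0 ms request delay, 10 s retry delay, 10 s connect, 30 s read

HttpClientParams custom = new HttpClientParams(5, 200, 30, 15, 60); // 5 retries, 200 ms, 30 s, 15 s, 60 s
custom.setReadTimeOut(120); // individual values can still be overridden via the setters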

View File

@ -0,0 +1,251 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.HttpHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
/**
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
*
* @author jochen, michele, andrea, alessia, claudio
*/
public class HttpConnector2 {
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
private static final String REPORT_PREFIX = "http:";
private HttpClientParams clientParams;
private String responseType = null;
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
}
public HttpConnector2(HttpClientParams clientParams) {
this.clientParams = clientParams;
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
return IOUtils.toInputStream(getInputSource(requestUrl));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public String getInputSource(final String requestUrl) throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @param report the list of errors
* @return the content of the downloaded resource
* @throws CollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl, AggregatorReport report)
throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, report);
}
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException {
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
return IOUtils.toString(s);
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException, IOException {
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
final String msg = String
.format(
"Max number of retries (%s/%s) exceeded, failing.",
retryNumber, getClientParams().getMaxNumberOfRetry());
log.error(msg);
throw new CollectorException(msg);
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null;
try {
if (getClientParams().getRequestDelay() > 0) {
backoffAndSleep(getClientParams().getRequestDelay());
}
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (is2xx(urlConn.getResponseCode())) {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
if (is3xx(urlConn.getResponseCode())) {
// REDIRECTS
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.info(String.format("The requested url has been moved to %s", newUrl));
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String.format("Moved to: %s", newUrl));
urlConn.disconnect();
if (retryAfter > 0) {
backoffAndSleep(retryAfter * 1000);
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
if (retryAfter > 0) {
log
.warn(
requestUrl + " - waiting and repeating request after suggested retry-after "
+ retryAfter + " sec.");
backoffAndSleep(retryAfter * 1000);
} else {
log
.warn(
requestUrl + " - waiting and repeating request after default delay of "
+ getClientParams().getRetryDelay() + " sec.");
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
}
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, report);
default:
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
}
}
throw new CollectorException(
String
.format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report)));
} catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
log.info("I'm going to sleep for {}ms", sleepTimeMs);
try {
Thread.sleep(sleepTimeMs);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0)
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
}
private boolean is2xx(final int statusCode) {
return statusCode >= 200 && statusCode <= 299;
}
private boolean is4xx(final int statusCode) {
return statusCode >= 400 && statusCode <= 499;
}
private boolean is3xx(final int statusCode) {
return statusCode >= 300 && statusCode <= 399;
}
private boolean is5xx(final int statusCode) {
return statusCode >= 500 && statusCode <= 599;
}
public String getResponseType() {
return responseType;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}
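A hedged usage sketch of the connector above (the endpoint URL is a placeholder): retries, redirects and Retry-After handling all happen inside getInputSource, and the failed attempts are accumulated in the AggregatorReport passed in.

// Usage sketch, assuming HttpConnector2, HttpClientParams, AggregatorReport and CollectorException (all shown in this diff) are on the classpath.
HttpClientParams params = new HttpClientParams();
params.setMaxNumberOfRetry(5);
params.setRetryDelay(20); // seconds

HttpConnector2 connector = new HttpConnector2(params);
AggregatorReport report = new AggregatorReport();
try {
	String body = connector.getInputSource("https://example.org/oai?verb=Identify", report);
	System.out.println(body.length() + " chars, content type: " + connector.getResponseType());
} catch (CollectorException e) {
	// the report now holds the entries added during the failed attempts (status codes, exception names)
	e.printStackTrace();
}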

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.collection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class JsonUtils {
private static final Log log = LogFactory.getLog(JsonUtils.class);
public static final String wrapName = "recordWrap";
/**
* converts, in JSON key names, whitespace(s) and '/' to '_', and '(' / ')' to '',
* so that the keys become valid XML tag names (see the W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags);
* this is a work-around for the JSON-to-XML conversion of the org.json.XML package.
*
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
*
* @param jsonInput
* @return convertedJsonKeynameOutput
*/
public String syntaxConvertJsonKeyNames(String jsonInput) {
log.trace("before convertJsonKeyNames: " + jsonInput);
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
// replace ' 's in JSON Namens with '_'
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
}
// replace forward-slash (sign '/' ) in JSON Names with '_'
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
}
// replace '(' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
}
// replace ')' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
}
// add prefix of startNumbers in JSON Keynames with 'n_'
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
}
// add prefix of only numbers in JSON Keynames with 'm_'
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
}
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
}
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
// }
// replace '=' in JSON Keynames with '-'
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
}
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
return jsonInput;
}
public String convertToXML(final String jsonRecord) {
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
log.trace("before inputStream: " + resultXml);
resultXml = XmlCleaner.cleanAllEntities(resultXml);
log.trace("after cleaning: " + resultXml);
return resultXml;
}
}
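A small illustration of the key-name normalization performed by syntaxConvertJsonKeyNames before the org.json conversion; the input record is made up, and the comments spell out which of the rewrite rules above fire (the exact XML produced by convertToXML depends on org.json.XML and is not reproduced here).

// Sketch, assuming JsonUtils (above) is on the classpath.
JsonUtils jsonUtils = new JsonUtils();

String json = "{\"dc title\":\"x\",\"dc/creator\":\"y\",\"size(bytes)\":\"1\",\"2018\":\"z\"}";
String normalized = jsonUtils.syntaxConvertJsonKeyNames(json);
// "dc title"    -> "dc_title"   (whitespace rule)
// "dc/creator"  -> "dc_creator" (forward-slash rule)
// "size(bytes)" -> "sizebytes"  (parenthesis rules)
// "2018"        -> "n_2018"     (leading-digit rule)

String xml = jsonUtils.convertToXML(json); // normalizes the key names and wraps the result in a single <recordWrap> root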

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.collection;
public class UnknownCollectorPluginException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public UnknownCollectorPluginException() {
super();
}
public UnknownCollectorPluginException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public UnknownCollectorPluginException(final String message, final Throwable cause) {
super(message, cause);
}
public UnknownCollectorPluginException(final String message) {
super(message);
}
public UnknownCollectorPluginException(final Throwable cause) {
super(cause);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection.worker.utils; package eu.dnetlib.dhp.collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;

View File

@ -3,10 +3,21 @@ package eu.dnetlib.dhp.collection.plugin;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
public interface CollectorPlugin { public interface CollectorPlugin {
Stream<String> collect(ApiDescriptor api) throws DnetCollectorException; enum NAME {
oai, other, rest_json2xml;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb
}
}
Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException;
} }
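For reference, a minimal, hypothetical implementation of the refactored interface above; it ignores most of the ApiDescriptor and just streams two constant records, which is enough to show the new collect(ApiDescriptor, AggregatorReport) contract.

import java.util.stream.Stream;

import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;

public class DummyCollectorPlugin implements CollectorPlugin {

	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
		if (api.getBaseUrl() == null || api.getBaseUrl().isEmpty()) {
			throw new CollectorException("Param 'baseUrl' is null or empty");
		}
		// each element of the stream is one record payload (typically XML) handed over to the harvesting job
		return Stream.of("<record>1</record>", "<record>2</record>");
	}
}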

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.collection.plugin.mongodb;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.MdstoreClient;
public class MDStoreCollectorPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class);
public static final String MONGODB_DBNAME = "mongodb_dbname";
public static final String MDSTORE_ID = "mdstore_id";
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
final String mongoBaseUrl = Optional
.ofNullable(api.getBaseUrl())
.orElseThrow(
() -> new CollectorException(
"missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl"));
log.info("mongoBaseUrl: {}", mongoBaseUrl);
final String dbName = Optional
.ofNullable(api.getParams().get(MONGODB_DBNAME))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME)));
log.info("dbName: {}", dbName);
final String mdId = Optional
.ofNullable(api.getParams().get(MDSTORE_ID))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID)));
log.info("mdId: {}", mdId);
final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName);
final MongoCollection<Document> mdstore = client.mdStore(mdId);
long size = mdstore.count();
return StreamSupport
.stream(
Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false)
.map(doc -> doc.getString("body"));
}
}
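A sketch of how this plugin would be invoked, assuming ApiDescriptor exposes the usual bean setters (setBaseUrl, setParams), which are not shown in this diff; connection string, database name and mdstore id are placeholders.

// Hypothetical wiring; only MONGODB_DBNAME, MDSTORE_ID and the collect signature are taken from the class above.
ApiDescriptor api = new ApiDescriptor();
api.setBaseUrl("mongodb://localhost:27017");
Map<String, String> params = new HashMap<>();
params.put(MDStoreCollectorPlugin.MONGODB_DBNAME, "mdstore");
params.put(MDStoreCollectorPlugin.MDSTORE_ID, "md-1234abcd");
api.setParams(params);

Stream<String> records = new MDStoreCollectorPlugin().collect(api, new AggregatorReport());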

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.collection.plugin.mongodb;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Optional;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.utils.DHPUtils;
public class MongoDbDumpCollectorPlugin implements CollectorPlugin {
public static final String PATH_PARAM = "path";
public static final String BODY_JSONPATH = "$.body";
public FileSystem fileSystem;
public MongoDbDumpCollectorPlugin(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
final Path path = Optional
.ofNullable(api.getParams().get("path"))
.map(Path::new)
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM)));
try {
if (!fileSystem.exists(path)) {
throw new CollectorException("path does not exist: " + path.toString());
}
return new BufferedReader(
new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset()))
.lines()
.map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s));
} catch (IOException e) {
throw new CollectorException(e);
}
}
}
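The plugin above expects, under the 'path' parameter, a gzip-compressed file of newline-separated JSON objects carrying the record payload in a "body" field (extracted via the $.body JSONPath). A small standard-library sketch of that layout follows; the real input lives on HDFS and is opened through the injected FileSystem, and the "_id" field is only an illustrative guess.

// Writes a tiny two-record dump in that layout to a local file, for illustration only.
// Requires java.io.*, java.util.zip.GZIPOutputStream, java.nio.charset.StandardCharsets; throws IOException.
try (Writer w = new OutputStreamWriter(
	new GZIPOutputStream(new FileOutputStream("/tmp/mdstore-dump.gz")), StandardCharsets.UTF_8)) {
	w.write("{\"_id\":\"rec-1\",\"body\":\"<record>1</record>\"}\n");
	w.write("{\"_id\":\"rec-2\",\"body\":\"<record>2</record>\"}\n");
}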

View File

@ -13,12 +13,17 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterators; import com.google.common.collect.Iterators;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
public class OaiCollectorPlugin implements CollectorPlugin { public class OaiCollectorPlugin implements CollectorPlugin {
public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";
public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";
private static final String FORMAT_PARAM = "format"; private static final String FORMAT_PARAM = "format";
private static final String OAI_SET_PARAM = "set"; private static final String OAI_SET_PARAM = "set";
private static final Object OAI_FROM_DATE_PARAM = "fromDate"; private static final Object OAI_FROM_DATE_PARAM = "fromDate";
@ -26,8 +31,15 @@ public class OaiCollectorPlugin implements CollectorPlugin {
private OaiIteratorFactory oaiIteratorFactory; private OaiIteratorFactory oaiIteratorFactory;
private HttpClientParams clientParams;
public OaiCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override @Override
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException { public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
throws CollectorException {
final String baseUrl = api.getBaseUrl(); final String baseUrl = api.getBaseUrl();
final String mdFormat = api.getParams().get(FORMAT_PARAM); final String mdFormat = api.getParams().get(FORMAT_PARAM);
final String setParam = api.getParams().get(OAI_SET_PARAM); final String setParam = api.getParams().get(OAI_SET_PARAM);
@ -46,26 +58,26 @@ public class OaiCollectorPlugin implements CollectorPlugin {
} }
if (baseUrl == null || baseUrl.isEmpty()) { if (baseUrl == null || baseUrl.isEmpty()) {
throw new DnetCollectorException("Param 'baseurl' is null or empty"); throw new CollectorException("Param 'baseurl' is null or empty");
} }
if (mdFormat == null || mdFormat.isEmpty()) { if (mdFormat == null || mdFormat.isEmpty()) {
throw new DnetCollectorException("Param 'mdFormat' is null or empty"); throw new CollectorException("Param 'mdFormat' is null or empty");
} }
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { if (fromDate != null && !fromDate.matches(DATE_REGEX) && !fromDate.matches(UTC_DATETIME_REGEX)) {
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
} }
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { if (untilDate != null && !untilDate.matches(DATE_REGEX) && !untilDate.matches(UTC_DATETIME_REGEX)) {
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
} }
final Iterator<Iterator<String>> iters = sets final Iterator<Iterator<String>> iters = sets
.stream() .stream()
.map( .map(
set -> getOaiIteratorFactory() set -> getOaiIteratorFactory()
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
.iterator(); .iterator();
return StreamSupport return StreamSupport
@ -79,4 +91,12 @@ public class OaiCollectorPlugin implements CollectorPlugin {
} }
return oaiIteratorFactory; return oaiIteratorFactory;
} }
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
} }
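The widened validation above now accepts either a plain date or a UTC datetime for fromDate/untilDate; a quick check against the two regexes defined in the class:

// Both granularities are now accepted (previously only YYYY-MM-DD passed validation).
String d1 = "2021-06-30";
String d2 = "2021-06-30T23:59:59Z";

System.out.println(d1.matches(OaiCollectorPlugin.DATE_REGEX));         // true
System.out.println(d2.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX)); // true
System.out.println(d2.matches(OaiCollectorPlugin.DATE_REGEX));         // false, full-string match required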

View File

@ -1,7 +1,9 @@
package eu.dnetlib.dhp.collection.plugin.oai; package eu.dnetlib.dhp.collection.plugin.oai;
import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.Iterator; import java.util.Iterator;
@ -9,24 +11,28 @@ import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Node; import org.dom4j.Node;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; import eu.dnetlib.dhp.collection.HttpConnector2;
import eu.dnetlib.dhp.collection.XmlCleaner;
public class OaiIterator implements Iterator<String> { public class OaiIterator implements Iterator<String> {
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
// 11/24/08 5:02 PM
private final static String REPORT_PREFIX = "oai:";
private final Queue<String> queue = new PriorityBlockingQueue<>(); private final Queue<String> queue = new PriorityBlockingQueue<>();
private final SAXReader reader = new SAXReader();
private final String baseUrl; private final String baseUrl;
private final String set; private final String set;
@ -35,7 +41,8 @@ public class OaiIterator implements Iterator<String> {
private final String untilDate; private final String untilDate;
private String token; private String token;
private boolean started; private boolean started;
private final HttpConnector httpConnector; private final HttpConnector2 httpConnector;
private final AggregatorReport report;
public OaiIterator( public OaiIterator(
final String baseUrl, final String baseUrl,
@ -43,7 +50,8 @@ public class OaiIterator implements Iterator<String> {
final String set, final String set,
final String fromDate, final String fromDate,
final String untilDate, final String untilDate,
final HttpConnector httpConnector) { final HttpConnector2 httpConnector,
final AggregatorReport report) {
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
this.mdFormat = mdFormat; this.mdFormat = mdFormat;
this.set = set; this.set = set;
@ -51,6 +59,7 @@ public class OaiIterator implements Iterator<String> {
this.untilDate = untilDate; this.untilDate = untilDate;
this.started = false; this.started = false;
this.httpConnector = httpConnector; this.httpConnector = httpConnector;
this.report = report;
} }
private void verifyStarted() { private void verifyStarted() {
@ -58,7 +67,7 @@ public class OaiIterator implements Iterator<String> {
this.started = true; this.started = true;
try { try {
this.token = firstPage(); this.token = firstPage();
} catch (final DnetCollectorException e) { } catch (final CollectorException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
@ -80,7 +89,7 @@ public class OaiIterator implements Iterator<String> {
while (queue.isEmpty() && token != null && !token.isEmpty()) { while (queue.isEmpty() && token != null && !token.isEmpty()) {
try { try {
token = otherPages(token); token = otherPages(token);
} catch (final DnetCollectorException e) { } catch (final CollectorException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
@ -92,23 +101,26 @@ public class OaiIterator implements Iterator<String> {
public void remove() { public void remove() {
} }
private String firstPage() throws DnetCollectorException { private String firstPage() throws CollectorException {
try { try {
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
if (set != null && !set.isEmpty()) { if (set != null && !set.isEmpty()) {
url += "&set=" + URLEncoder.encode(set, "UTF-8"); url += "&set=" + URLEncoder.encode(set, "UTF-8");
} }
if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { if (fromDate != null && (fromDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| fromDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
} }
if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { if (untilDate != null && (untilDate.matches(OaiCollectorPlugin.DATE_REGEX)
|| untilDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) {
url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
} }
log.info("Start harvesting using url: " + url); log.info("Start harvesting using url: " + url);
return downloadPage(url); return downloadPage(url);
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
throw new DnetCollectorException(e); report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} }
} }
@ -126,32 +138,35 @@ public class OaiIterator implements Iterator<String> {
return result.trim(); return result.trim();
} }
private String otherPages(final String resumptionToken) throws DnetCollectorException { private String otherPages(final String resumptionToken) throws CollectorException {
try { try {
return downloadPage( return downloadPage(
baseUrl baseUrl
+ "?verb=ListRecords&resumptionToken=" + "?verb=ListRecords&resumptionToken="
+ URLEncoder.encode(resumptionToken, "UTF-8")); + URLEncoder.encode(resumptionToken, "UTF-8"));
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
throw new DnetCollectorException(e); report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} }
} }
private String downloadPage(final String url) throws DnetCollectorException { private String downloadPage(final String url) throws CollectorException {
final String xml = httpConnector.getInputSource(url); final String xml = httpConnector.getInputSource(url, report);
Document doc; Document doc;
try { try {
doc = reader.read(new StringReader(xml)); doc = DocumentHelper.parseText(xml);
} catch (final DocumentException e) { } catch (final DocumentException e) {
log.warn("Error parsing xml, I try to clean it: " + xml, e); log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
report.put(e.getClass().getName(), e.getMessage());
final String cleaned = XmlCleaner.cleanAllEntities(xml); final String cleaned = XmlCleaner.cleanAllEntities(xml);
try { try {
doc = reader.read(new StringReader(cleaned)); doc = DocumentHelper.parseText(cleaned);
} catch (final DocumentException e1) { } catch (final DocumentException e1) {
final String resumptionToken = extractResumptionToken(xml); final String resumptionToken = extractResumptionToken(xml);
if (resumptionToken == null) { if (resumptionToken == null) {
throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1); report.put(e1.getClass().getName(), e1.getMessage());
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
} }
return resumptionToken; return resumptionToken;
} }
@ -159,19 +174,35 @@ public class OaiIterator implements Iterator<String> {
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
if (errorNode != null) { if (errorNode != null) {
final String code = errorNode.valueOf("@code"); final String code = errorNode.valueOf("@code").trim();
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { if ("noRecordsMatch".equalsIgnoreCase(code)) {
log.warn("noRecordsMatch for oai call: " + url); final String msg = "noRecordsMatch for oai call : " + url;
log.warn(msg);
report.put(REPORT_PREFIX + code, msg);
return null; return null;
} else { } else {
throw new DnetCollectorException(code + " - " + errorNode.getText()); final String msg = code + " - " + errorNode.getText();
report.put(REPORT_PREFIX + "error", msg);
throw new CollectorException(msg);
} }
} }
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
queue.add(((Node) o).asXML()); final StringWriter sw = new StringWriter();
final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
try {
writer.write((Node) o);
queue.add(sw.toString());
} catch (IOException e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e);
}
} }
return doc.valueOf("//*[local-name()='resumptionToken']"); return doc.valueOf("//*[local-name()='resumptionToken']");
} }
public AggregatorReport getReport() {
return report;
}
} }
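As a reading aid, the ListRecords request assembled by firstPage() above, with a hypothetical endpoint and the optional set/from/until arguments; the string concatenation mirrors the code, and the encode calls sit inside a try/catch for UnsupportedEncodingException, as in firstPage().

// Hypothetical values only.
String url = "https://example.org/oai" + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode("oai_dc", "UTF-8")
	+ "&set=" + URLEncoder.encode("openaire", "UTF-8")
	+ "&from=" + URLEncoder.encode("2021-01-01", "UTF-8")
	+ "&until=" + URLEncoder.encode("2021-06-30T23:59:59Z", "UTF-8");
// -> https://example.org/oai?verb=ListRecords&metadataPrefix=oai_dc&set=openaire&from=2021-01-01&until=2021-06-30T23%3A59%3A59Z
// subsequent pages only carry ?verb=ListRecords&resumptionToken=... (see otherPages())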

View File

@ -3,24 +3,28 @@ package eu.dnetlib.dhp.collection.plugin.oai;
import java.util.Iterator; import java.util.Iterator;
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.HttpConnector2;
public class OaiIteratorFactory { public class OaiIteratorFactory {
private HttpConnector httpConnector; private HttpConnector2 httpConnector;
public Iterator<String> newIterator( public Iterator<String> newIterator(
final String baseUrl, final String baseUrl,
final String mdFormat, final String mdFormat,
final String set, final String set,
final String fromDate, final String fromDate,
final String untilDate) { final String untilDate,
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); final HttpClientParams clientParams,
final AggregatorReport report) {
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report);
} }
private HttpConnector getHttpConnector() { private HttpConnector2 getHttpConnector(HttpClientParams clientParams) {
if (httpConnector == null) if (httpConnector == null)
httpConnector = new HttpConnector(); httpConnector = new HttpConnector2(clientParams);
return httpConnector; return httpConnector;
} }
} }

View File

@ -0,0 +1,105 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
/**
* TODO: delegate HTTP requests to the common HttpConnector2 implementation.
*
* @author js, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestCollectorPlugin implements CollectorPlugin {
public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
private final HttpClientParams clientParams;
public RestCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
final String baseUrl = api.getBaseUrl();
final String resumptionType = api.getParams().get("resumptionType");
final String resumptionParam = api.getParams().get("resumptionParam");
final String resumptionXpath = api.getParams().get("resumptionXpath");
final String resultTotalXpath = api.getParams().get("resultTotalXpath");
final String resultFormatParam = api.getParams().get("resultFormatParam");
final String resultFormatValue = api.getParams().get("resultFormatValue");
final String resultSizeParam = api.getParams().get("resultSizeParam");
final String queryParams = api.getParams().get("queryParams");
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank)
.orElse(RESULT_SIZE_VALUE_DEFAULT);
if (StringUtils.isBlank(baseUrl)) {
throw new CollectorException("Param 'baseUrl' is null or empty");
}
if (StringUtils.isBlank(resumptionType)) {
throw new CollectorException("Param 'resumptionType' is null or empty");
}
if (StringUtils.isBlank(resumptionParam)) {
throw new CollectorException("Param 'resumptionParam' is null or empty");
}
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
final String resultOutputFormat = Optional
.ofNullable(api.getParams().get("resultOutputFormat"))
.map(String::toLowerCase)
.filter(StringUtils::isNotBlank)
.orElse(resultFormatValue.toLowerCase());
RestIterator it = new RestIterator(
getClientParams(),
baseUrl,
resumptionType,
resumptionParam,
resumptionXpath,
resultTotalXpath,
resultFormatParam,
resultFormatValue,
resultSizeParam,
resultSizeValue,
queryParams,
entityXpath,
authMethod,
authToken,
resultOutputFormat);
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
}
public HttpClientParams getClientParams() {
return clientParams;
}
}
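A hedged configuration sketch for this plugin, assuming ApiDescriptor exposes standard bean setters (not shown in this diff); endpoint, xpaths and parameter values are placeholders, while the parameter names and the mandatory/optional split come from the checks above.

// Hypothetical ApiDescriptor for a REST source.
ApiDescriptor api = new ApiDescriptor();
api.setBaseUrl("https://api.example.org/records");
Map<String, String> p = new HashMap<>();
p.put("resumptionType", "count");       // mandatory
p.put("resumptionParam", "start");      // mandatory
p.put("resultFormatValue", "json");     // mandatory
p.put("queryParams", "q=*");            // mandatory
p.put("entityXpath", "//record");       // mandatory
p.put("resultTotalXpath", "//total");   // xpath the iterator uses to read the total number of results
p.put("resultSizeParam", "rows");       // optional; resultSizeValue falls back to "100" when blank
api.setParams(p);

Stream<String> records = new RestCollectorPlugin(new HttpClientParams()).collect(api, new AggregatorReport());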

View File

@ -0,0 +1,411 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.JsonUtils;
/**
* log.info(...) is equivalent to log.trace(...) in the application logs
* <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
*
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8";
private final HttpClientParams clientParams;
private final String BASIC = "basic";
private final JsonUtils jsonUtils;
private final String baseUrl;
private final String resumptionType;
private final String resumptionParam;
private final String resultFormatValue;
private String queryParams;
private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
// or token scanned from results)
private InputStream resultStream;
private Transformer transformer;
private XPath xpath;
private String query;
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private final String queryFormat;
private final String querySize;
private final String authMethod;
private final String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private int discoverResultSize = 0;
private int pagination = 1;
/*
* While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in
* json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
* json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
*/
private final String resultOutputFormat;
/** RestIterator class
* compatible to version 1.3.33
*/
public RestIterator(
final HttpClientParams clientParams,
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath,
final String authMethod,
final String authToken,
final String resultOutputFormat) {
this.clientParams = clientParams;
this.jsonUtils = new JsonUtils();
this.baseUrl = baseUrl;
this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue;
this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
this.queryParams = queryParams;
this.authMethod = authMethod;
this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat;
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
}
initQueue();
}
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
throws TransformerConfigurationException, XPathExpressionException {
transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath);
}
private void initQueue() {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
log.info("REST calls starting with " + query);
}
private void disconnect() {
// TODO close inputstream
}
/*
* (non-Javadoc)
* @see java.util.Iterator#hasNext()
*/
@Override
public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) {
disconnect();
return false;
} else {
return true;
}
}
/*
* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) {
try {
query = downloadPage(query);
} catch (CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: " + e);
throw new RuntimeException(e);
}
}
return recordQueue.poll();
}
}
/*
* download page and return nextQuery
*/
private String downloadPage(String query) throws CollectorException {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = "";
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
int urlOldResumptionSize = 0;
InputStream theHttpInputStream;
// check if cursor=* is initial set otherwise add it to the queryParam URL
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
if (!query.contains("&cursor=")) {
query += "&cursor=*";
}
}
try {
log.info("requestig URL [{}]", query);
URL qUrl = new URL(query);
log.debug("authMethod :" + authMethod);
if ("bearer".equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
}
resultStream = theHttpInputStream;
if ("json".equals(resultOutputFormat)) {
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = jsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: " + nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
String toEnqueue = sw.toString();
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
} else {
recordQueue.add(sw.toString());
}
}
} else {
log.warn("resultXml is equal with emptyXml");
}
resumptionInt += resultSizeValue;
switch (resumptionType.toLowerCase()) {
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
resumptionStr = xprResumptionPath.evaluate(resultNode);
break;
case "count": // begin at one step for all records, iterate over items
resumptionStr = Integer.toString(resumptionInt);
break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (resultSizeValue < 2) {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
String[] arrayQUrlArgument = qUrlArgument.split("&");
for (String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(resumptionParam)) {
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
} else {
log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
}
}
}
if (((emptyXml).equalsIgnoreCase(resultXml))
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
// resumptionStr = "";
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
resultTotal = discoverResultSize;
} else {
resumptionStr = Integer.toString(resumptionInt);
resultTotal = resumptionInt + 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
}
log.info("discoverResultSize: {}", discoverResultSize);
break;
case "pagination":
case "page": // pagination, iterate over page numbers
pagination += 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
} else {
resultTotal = discoverResultSize;
pagination = discoverResultSize;
}
resumptionInt = pagination;
resumptionStr = Integer.toString(resumptionInt);
break;
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
// solr)
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
// deep-cursor, Param 'resultSizeValue' is less than 2");}
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
queryParams = queryParams.replace("&cursor=*", "");
// terminating if length of nodeList is 0
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
} else {
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
// because the iteration is over
// real length and the
// resultSizeValue is added before
// the switch()
}
discoverResultSize = nodeList.getLength();
log
.debug(
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
break;
default: // otherwise: abort
// resultTotal = resumptionInt;
break;
}
} catch (Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("collection failed: " + e.getMessage());
}
try {
if (resultTotal == -1) {
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + resultTotal);
}
} catch (Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
}
log.debug("resultTotal: " + resultTotal);
log.debug("resInt: " + resumptionInt);
if (resumptionInt <= resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+ queryFormat;
} else {
nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
}
private boolean isInteger(String s) {
boolean isValidInteger = false;
try {
Integer.parseInt(s);
// s is a valid integer
isValidInteger = true;
} catch (NumberFormatException ex) {
// s is not an integer
}
return isValidInteger;
}
// Method to encode a string value using `UTF-8` encoding scheme
private String encodeValue(String value) {
try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex);
}
}
public String getResultFormatValue() {
return resultFormatValue;
}
public String getResultOutputFormat() {
return resultOutputFormat;
}
}
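To make the paging behaviour above easier to follow, this is the shape of the next-page URL composed at the end of downloadPage(), with hypothetical values for a "count"-style resumption; recall the known bug noted in the class javadoc: with resumptionType 'discover', (resultTotal % resultSizeValue) == 0 terminates collection early, so choose a resultSizeValue that does not divide the total.

// Hypothetical values, mirroring: baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat
String baseUrl = "https://api.example.org/records";
String queryParams = "q=*";
String querySize = "&rows=100";        // "&" + resultSizeParam + "=" + resultSizeValue
String queryFormat = "&format=json";   // "&" + resultFormatParam + "=" + resultFormatValue
String resumptionParam = "start";
int resumptionInt = 100;               // incremented by resultSizeValue after each downloaded page ("count" mode)

String nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionInt + queryFormat;
// -> https://api.example.org/records?q=*&rows=100&start=100&format=json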

Some files were not shown because too many files have changed in this diff