Compare commits

..

11 Commits

Author SHA1 Message Date
Sandro La Bruzzo 9ca438d9b1 imported from branch stable_ids generation of Actionset datacite 2021-06-10 14:59:45 +02:00
Sandro La Bruzzo 42ff7a5665 some fix to the pom to compile scala 2021-06-10 14:31:06 +02:00
Sandro La Bruzzo ebe6aa6d38 implemented datacite transformation also on master 2021-06-10 10:52:36 +02:00
Claudio Atzori a4cfabdbc6 Merge pull request 'master' (#111) from antonis.lempesis/dnet-hadoop:master into master
Reviewed-on: D-Net/dnet-hadoop#111
2021-05-28 14:09:12 +02:00
Claudio Atzori 338327171d integrating pull #109, H2020Classification 2021-05-27 11:57:01 +02:00
Claudio Atzori 6cbda49112 more pervasive use of constants from ModelConstants, especially for ORCID 2021-05-26 18:13:04 +02:00
Claudio Atzori ea9b00ce56 adjusted test 2021-05-20 15:31:42 +02:00
Claudio Atzori 2e70aa43f0 Merge pull request 'H2020Classification fix and possibility to add datasources in blacklist for propagation of result to organization' (#108) from miriam.baglioni/dnet-hadoop:master into master
Reviewed-on: D-Net/dnet-hadoop#108

The changes look ok, but please drop a comment to describe how the parameters should be changed from the workflow caller for both workflows
* H2020Classification
* propagation of result to organization
2021-05-20 15:25:05 +02:00
Antonis Lempesis 168edcbde3 added the final steps for the observatory promote wf and some cleanup 2021-05-18 15:23:20 +03:00
Antonis Lempesis 625d993cd9 added step for observatory db 2021-04-20 02:31:06 +03:00
Antonis Lempesis 25d0512fbd code cleanup 2021-04-20 01:43:23 +03:00
470 changed files with 9776 additions and 29148 deletions

3
.gitignore vendored
View File

@ -7,8 +7,6 @@
*.iws
*~
.vscode
.metals
.bloop
.classpath
/*/.classpath
/*/*/.classpath
@ -26,5 +24,4 @@
spark-warehouse
/**/job-override.properties
/**/*.log
/**/.factorypath

View File

@ -7,7 +7,6 @@
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-common</artifactId>
@ -54,6 +53,11 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
<dependency>
<groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -94,16 +98,6 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.collector.worker.model;
import java.util.HashMap;
import java.util.Map;

View File

@ -1,14 +0,0 @@
package eu.dnetlib.dhp.application;
import java.io.*;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import com.google.common.collect.Maps;
public class ApplicationUtils {
}

View File

@ -1,7 +1,10 @@
package eu.dnetlib.dhp.application;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@ -9,21 +12,17 @@ import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class ArgumentApplicationParser implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
private final Options options = new Options();
private final Map<String, String> objectMap = new HashMap<>();
private final List<String> compressedValues = new ArrayList<>();
public ArgumentApplicationParser(final String json_configuration) throws IOException {
public ArgumentApplicationParser(final String json_configuration) throws Exception {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
createOptionMap(configuration);
@ -34,6 +33,7 @@ public class ArgumentApplicationParser implements Serializable {
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays
.stream(configuration)
.map(
@ -47,6 +47,10 @@ public class ArgumentApplicationParser implements Serializable {
return o;
})
.forEach(options::addOption);
// HelpFormatter formatter = new HelpFormatter();
// formatter.printHelp("myapp", null, options, null, true);
}
public static String decompressValue(final String abstractCompressed) {
@ -57,7 +61,7 @@ public class ArgumentApplicationParser implements Serializable {
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (Throwable e) {
log.error("Wrong value to decompress:" + abstractCompressed);
System.out.println("Wrong value to decompress:" + abstractCompressed);
throw new RuntimeException(e);
}
}
@ -70,7 +74,7 @@ public class ArgumentApplicationParser implements Serializable {
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws ParseException {
public void parseArgument(final String[] args) throws Exception {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
Arrays

View File

@ -27,26 +27,4 @@ public class Constants {
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
public static final String REPORT_FILE_NAME = "/report";
public static final String MDSTORE_DATA_PATH = "/store";
public static final String MDSTORE_SIZE_PATH = "/size";
public static final String COLLECTION_MODE = "collectionMode";
public static final String METADATA_ENCODING = "metadataEncoding";
public static final String OOZIE_WF_PATH = "oozieWfPath";
public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
public static final String REQUEST_DELAY = "requestDelay";
public static final String RETRY_DELAY = "retryDelay";
public static final String CONNECT_TIMEOUT = "connectTimeOut";
public static final String READ_TIMEOUT = "readTimeOut";
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
public static final String CONTENT_TOTALITEMS = "TotalItems";
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
}

View File

@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private final Connection connection;
private Connection connection;
public DbClient(final String address, final String login, final String password) {

View File

@ -100,7 +100,7 @@ public class MakeTarArchive implements Serializable {
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte[] data = new byte[1024];
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}

View File

@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {

View File

@ -1,72 +0,0 @@
package eu.dnetlib.dhp.common.rest;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static final ObjectMapper mapper = new ObjectMapper();
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws Exception {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
CloseableHttpClient client = HttpClients.createDefault();
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log
.info(
"request headers: {}",
Arrays
.asList(r.getAllHeaders())
.stream()
.map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(",")));
CloseableHttpResponse response = client.execute(r);
return IOUtils.toString(response.getEntity().getContent());
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
return mapper.readValue(doHTTPRequest(r), clazz);
}
}

View File

@ -1,64 +0,0 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L;
public static String CURRENT_PARAM = "current";
public static String TOTAL_PARAM = "total";
private MessageType messageType;
private String workflowId;
private Map<String, String> body;
public Message() {
}
public Message(final MessageType messageType, final String workflowId) {
this(messageType, workflowId, new LinkedHashMap<>());
}
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
this.messageType = messageType;
this.workflowId = workflowId;
this.body = body;
}
public MessageType getMessageType() {
return messageType;
}
public void setMessageType(MessageType messageType) {
this.messageType = messageType;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(final String workflowId) {
this.workflowId = workflowId;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(final Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
}
}

View File

@ -1,94 +0,0 @@
package eu.dnetlib.dhp.message;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MessageSender {
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
private static final int SOCKET_TIMEOUT_MS = 2000;
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
private static final int CONNTECTION_TIMEOUT_MS = 2000;
private final ObjectMapper objectMapper = new ObjectMapper();
private final String dnetMessageEndpoint;
private final String workflowId;
private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;
this.dnetMessageEndpoint = dnetMessageEndpoint;
}
public void sendMessage(final Message message) {
executorService.submit(() -> _sendMessage(message));
}
public void sendMessage(final Long current, final Long total) {
sendMessage(createOngoingMessage(current, total));
}
public void sendReport(final Map<String, String> report) {
sendMessage(new Message(MessageType.REPORT, workflowId, report));
}
private Message createOngoingMessage(final Long current, final Long total) {
final Message m = new Message(MessageType.ONGOING, workflowId);
m.getBody().put(Message.CURRENT_PARAM, current.toString());
if (total != null) {
m.getBody().put(Message.TOTAL_PARAM, total.toString());
}
return m;
}
private void _sendMessage(final Message message) {
try {
final String json = objectMapper.writeValueAsString(message);
final HttpPut req = new HttpPut(dnetMessageEndpoint);
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
final RequestConfig requestConfig = RequestConfig
.custom()
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
.setSocketTimeout(SOCKET_TIMEOUT_MS)
.build();
try (final CloseableHttpClient client = HttpClients
.custom()
.setDefaultRequestConfig(requestConfig)
.build();
final CloseableHttpResponse response = client.execute(req)) {
log.debug("Sent Message to " + dnetMessageEndpoint);
log.debug("MESSAGE:" + message);
} catch (final Throwable e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
} catch (final JsonProcessingException e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
}
}

View File

@ -1,21 +0,0 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
public enum MessageType implements Serializable {
ONGOING, REPORT;
public MessageType from(String value) {
return Optional
.ofNullable(value)
.map(StringUtils::upperCase)
.map(MessageType::valueOf)
.orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value));
}
}

View File

@ -0,0 +1,121 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
import eu.dnetlib.dhp.utils.DHPUtils;
/** This class models a record inside the new Metadata store collection on HDFS * */
public class MetadataRecord implements Serializable {
/** The D-Net Identifier associated to the record */
private String id;
/** The original Identifier of the record */
private String originalId;
/** The encoding of the record, should be JSON or XML */
private String encoding;
/**
* The information about the provenance of the record see @{@link Provenance} for the model of this information
*/
private Provenance provenance;
/** The content of the metadata */
private String body;
/** the date when the record has been stored */
private long dateOfCollection;
/** the date when the record has been stored */
private long dateOfTransformation;
public MetadataRecord() {
this.dateOfCollection = System.currentTimeMillis();
}
public MetadataRecord(
String originalId,
String encoding,
Provenance provenance,
String body,
long dateOfCollection) {
this.originalId = originalId;
this.encoding = encoding;
this.provenance = provenance;
this.body = body;
this.dateOfCollection = dateOfCollection;
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getOriginalId() {
return originalId;
}
public void setOriginalId(String originalId) {
this.originalId = originalId;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public Provenance getProvenance() {
return provenance;
}
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
}
public String getBody() {
return body;
}
public void setBody(String body) {
this.body = body;
}
public long getDateOfCollection() {
return dateOfCollection;
}
public void setDateOfCollection(long dateOfCollection) {
this.dateOfCollection = dateOfCollection;
}
public long getDateOfTransformation() {
return dateOfTransformation;
}
public void setDateOfTransformation(long dateOfTransformation) {
this.dateOfTransformation = dateOfTransformation;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof MetadataRecord)) {
return false;
}
return ((MetadataRecord) o).getId().equalsIgnoreCase(id);
}
@Override
public int hashCode() {
return id.hashCode();
}
}

View File

@ -0,0 +1,52 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
/**
* @author Sandro La Bruzzo
* <p>
* Provenace class models the provenance of the record in the metadataStore It contains the identifier and the
* name of the datasource that gives the record
*/
public class Provenance implements Serializable {
private String datasourceId;
private String datasourceName;
private String nsPrefix;
public Provenance() {
}
public Provenance(String datasourceId, String datasourceName, String nsPrefix) {
this.datasourceId = datasourceId;
this.datasourceName = datasourceName;
this.nsPrefix = nsPrefix;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getNsPrefix() {
return nsPrefix;
}
public void setNsPrefix(String nsPrefix) {
this.nsPrefix = nsPrefix;
}
}

View File

@ -1,9 +1,11 @@
package eu.dnetlib.dhp.schema.oaf.utils;
package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
@ -11,45 +13,42 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right);
public static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) {
return mergeResults((Result) o1, (Result) o2);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2);
} else {
throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
}
return left;
return o1;
}
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (ModelSupport.isSubClass(left, Result.class)) {
return mergeResults((Result) left, (Result) right);
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
left.mergeFrom(right);
public static Result mergeResults(Result r1, Result r2) {
if (new ResultTypeComparator().compare(r1, r2) < 0) {
r1.mergeFrom(r2);
return r1;
} else {
throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
}
return left;
}
public static Result mergeResults(Result left, Result right) {
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
r2.mergeFrom(r1);
return r2;
}
}
@ -105,29 +104,6 @@ public class OafMapperUtils {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
@ -141,15 +117,6 @@ public class OafMapperUtils {
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -300,7 +267,7 @@ public class OafMapperUtils {
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
@ -333,36 +300,4 @@ public class OafMapperUtils {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new AccessRightComparator<>());
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

View File

@ -1,29 +1,18 @@
package eu.dnetlib.dhp.utils;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
@ -32,8 +21,6 @@ import scala.collection.Seq;
public class DHPUtils {
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
@ -92,72 +79,4 @@ public class DHPUtils {
return "";
}
}
public static final ObjectMapper MAPPER = new ObjectMapper();
public static void writeHdfsFile(final Configuration conf, final String content, final String path)
throws IOException {
log.info("writing file {}, size {}", path, content.length());
try (FileSystem fs = FileSystem.get(conf);
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
os.write(content.getBytes(StandardCharsets.UTF_8));
os.flush();
}
}
public static String readHdfsFile(Configuration conf, String path) throws IOException {
log.info("reading file {}", path);
try (FileSystem fs = FileSystem.get(conf)) {
final Path p = new Path(path);
if (!fs.exists(p)) {
throw new FileNotFoundException(path);
}
return IOUtils.toString(fs.open(p));
}
}
public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
return MAPPER.readValue(readHdfsFile(conf, path), clazz);
}
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
log.info("saving dataset in: {}", targetPath);
mdstore
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(targetPath);
}
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
return conf;
}
public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
File file = new File(System.getProperty("oozie.action.output.properties"));
Properties props = new Properties();
report.forEach((k, v) -> props.setProperty(k, v));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
}
public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
Map<String, String> report = Maps.newHashMap();
report.put(paramName, value);
populateOOZIEEnv(report);
}
}

View File

@ -15,8 +15,8 @@ public class ISLookupClientFactory {
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
private static final int requestTimeout = 60000 * 10;
private static final int connectTimeout = 60000 * 10;
private static int requestTimeout = 60000 * 10;
private static int connectTimeout = 60000 * 10;
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);

View File

@ -0,0 +1,76 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.Map;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class Message {
private String workflowId;
private String jobName;
private MessageType type;
private Map<String, String> body;
public static Message fromJson(final String json) throws IOException {
final ObjectMapper jsonMapper = new ObjectMapper();
return jsonMapper.readValue(json, Message.class);
}
public Message() {
}
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
this.workflowId = workflowId;
this.jobName = jobName;
this.type = type;
this.body = body;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(String workflowId) {
this.workflowId = workflowId;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public MessageType getType() {
return type;
}
public void setType(MessageType type) {
this.type = type;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
final ObjectMapper jsonMapper = new ObjectMapper();
try {
return jsonMapper.writeValueAsString(this);
} catch (JsonProcessingException e) {
return null;
}
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.LinkedBlockingQueue;
import com.rabbitmq.client.AMQP;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
public class MessageConsumer extends DefaultConsumer {
final LinkedBlockingQueue<Message> queueMessages;
/**
* Constructs a new instance and records its association to the passed-in channel.
*
* @param channel the channel to which this consumer is attached
* @param queueMessages
*/
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
super(channel);
this.queueMessages = queueMessages;
}
@Override
public void handleDelivery(
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
throws IOException {
final String json = new String(body, StandardCharsets.UTF_8);
Message message = Message.fromJson(json);
try {
this.queueMessages.put(message);
System.out.println("Receiving Message " + message);
} catch (InterruptedException e) {
if (message.getType() == MessageType.REPORT)
throw new RuntimeException("Error on sending message");
else {
// TODO LOGGING EXCEPTION
}
} finally {
getChannel().basicAck(envelope.getDeliveryTag(), false);
}
}
}

View File

@ -0,0 +1,136 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeoutException;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
public class MessageManager {
private final String messageHost;
private final String username;
private final String password;
private Connection connection;
private final Map<String, Channel> channels = new HashMap<>();
private boolean durable;
private boolean autodelete;
private final LinkedBlockingQueue<Message> queueMessages;
public MessageManager(
String messageHost,
String username,
String password,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
}
public MessageManager(
String messageHost,
String username,
String password,
boolean durable,
boolean autodelete,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
this.durable = durable;
this.autodelete = autodelete;
}
private Connection createConnection() throws IOException, TimeoutException {
ConnectionFactory factory = new ConnectionFactory();
factory.setHost(this.messageHost);
factory.setUsername(this.username);
factory.setPassword(this.password);
return factory.newConnection();
}
private Channel createChannel(
final Connection connection,
final String queueName,
final boolean durable,
final boolean autodelete)
throws Exception {
Map<String, Object> args = new HashMap<>();
args.put("x-message-ttl", 10000);
Channel channel = connection.createChannel();
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
return channel;
}
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
throws Exception {
if (channels.containsKey(queueName)) {
return channels.get(queueName);
}
if (this.connection == null) {
this.connection = createConnection();
}
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
return channels.get(queueName);
}
public void close() throws IOException {
channels
.values()
.forEach(
ch -> {
try {
ch.close();
} catch (Exception e) {
// TODO LOG
}
});
this.connection.close();
}
public boolean sendMessage(final Message message, String queueName) throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public boolean sendMessage(
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public void startConsumingMessage(
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
}
}

View File

@ -0,0 +1,6 @@
package eu.dnetlib.message;
public enum MessageType {
ONGOING, REPORT
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,16 @@
package eu.dnetlib.dhp.model.mdstore;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
public class MetadataRecordTest {
@Test
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() > 0);
}
}

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(p1.getCollectedfrom().size(), 1);
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
assertEquals(d2.getCollectedfrom().size(), 1);
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid()
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
assertEquals(p2.getCollectedfrom().size(), 1);
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(d1.getCollectedfrom().size(), 1);
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertTrue(
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid()
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.message;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
public class MessageTest {
@Test
public void fromJsonTest() throws IOException {
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
System.out.println("m = " + m);
Message m1 = Message.fromJson(m.toString());
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
assertEquals(m1.getType(), m.getType());
assertEquals(m1.getJobName(), m.getJobName());
assertNotNull(m1.getBody());
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
assertEquals(m1.getJobName(), m.getJobName());
}
@Test
public void toStringTest() {
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
assertEquals(expectedJson, m.toString());
}
}

View File

@ -1 +0,0 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -1 +0,0 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

View File

@ -1 +0,0 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -1 +0,0 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

View File

@ -51,6 +51,16 @@
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>

View File

@ -0,0 +1,69 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.util.Comparator;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals("OPEN SOURCE"))
return -1;
if (rClass.equals("OPEN SOURCE"))
return 1;
if (lClass.equals("OPEN"))
return -1;
if (rClass.equals("OPEN"))
return 1;
if (lClass.equals("6MONTHS"))
return -1;
if (rClass.equals("6MONTHS"))
return 1;
if (lClass.equals("12MONTHS"))
return -1;
if (rClass.equals("12MONTHS"))
return 1;
if (lClass.equals("EMBARGO"))
return -1;
if (rClass.equals("EMBARGO"))
return 1;
if (lClass.equals("RESTRICTED"))
return -1;
if (rClass.equals("RESTRICTED"))
return 1;
if (lClass.equals("CLOSED"))
return -1;
if (rClass.equals("CLOSED"))
return 1;
if (lClass.equals("UNKNOWN"))
return -1;
if (rClass.equals("UNKNOWN"))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

View File

@ -0,0 +1,196 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class MigrateActionSet {
private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
}
private void run(ArgumentApplicationParser parser) throws Exception {
final String isLookupUrl = parser.get("isLookupUrl");
final String sourceNN = parser.get("sourceNameNode");
final String targetNN = parser.get("targetNameNode");
final String workDir = parser.get("workingDirectory");
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
final String distcp_memory_mb = parser.get("distcp_memory_mb");
final String distcp_task_timeout = parser.get("distcp_task_timeout");
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: {}", transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: {}", transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
FileSystem targetFS = FileSystem.get(conf);
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
FileSystem sourceFS = FileSystem.get(sourceConf);
Properties props = new Properties();
List<Path> targetPaths = new ArrayList<>();
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log
.info(
"paths to process:\n{}", sourcePaths
.stream()
.map(p -> p.toString())
.collect(Collectors.joining("\n")));
for (Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn("skipping unexisting path: {}", source);
} else {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info("got RAWSET: {}", rawSet);
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
final String actionSetDirectory = pathQ.pollLast();
final Path targetPath = new Path(
targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
log.info("using TARGET PATH: {}", targetPath);
if (!transformOnly) {
if (targetFS.exists(targetPath)) {
targetFS.delete(targetPath, true);
}
runDistcp(
distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
}
targetPaths.add(targetPath);
}
}
}
final String targetPathsCsv = targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","));
props.setProperty(TARGET_PATHS, targetPathsCsv);
File file = new File(System.getProperty("oozie.action.output.properties"));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
System.out.println(file.getAbsolutePath());
}
private void runDistcp(
Integer distcp_num_maps,
String distcp_memory_mb,
String distcp_task_timeout,
Configuration conf,
Path source,
Path targetPath)
throws Exception {
final DistCpOptions op = new DistCpOptions(source, targetPath);
op.setMaxMaps(distcp_num_maps);
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
int res = ToolRunner
.run(
new DistCp(conf, op),
new String[] {
"-Dmapred.task.timeout=" + distcp_task_timeout,
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
"-pb",
"-m " + distcp_num_maps,
source.toString(),
targetPath.toString()
});
if (res != 0) {
throw new RuntimeException(String.format("distcp exited with code %s", res));
}
}
private Configuration getConfiguration(
String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
final Configuration conf = new Configuration();
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
conf.set("dfs.http.client.retry.policy.enabled", "true");
conf.set("mapred.task.timeout", distcp_task_timeout);
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
return conf;
}
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp)
throws ISLookUpException {
String XQUERY = "distinct-values(\n"
+ "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n"
+ "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n"
+ "let $setDir := $x//SET/@directory/string()\n"
+ "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n"
+ "return concat($basePath, '/', $setDir, '/', $rawSet))";
log.info(String.format("running xquery:\n%s", XQUERY));
return isLookUp
.quickSearchProfile(XQUERY)
.stream()
.map(p -> sourceNN + p)
.map(Path::new)
.collect(Collectors.toList());
}
}

View File

@ -0,0 +1,710 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.*;
import eu.dnetlib.dhp.schema.oaf.*;
public class ProtoConverter implements Serializable {
public static Oaf convert(OafProtos.Oaf oaf) {
try {
switch (oaf.getKind()) {
case entity:
return convertEntity(oaf);
case relation:
return convertRelation(oaf);
default:
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
}
} catch (Throwable e) {
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
}
}
private static Relation convertRelation(OafProtos.Oaf oaf) {
final OafProtos.OafRel r = oaf.getRel();
final Relation rel = new Relation();
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
rel.setSource(r.getSource());
rel.setTarget(r.getTarget());
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel
.setCollectedfrom(
r.getCollectedfromCount() > 0
? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList())
: null);
return rel;
}
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getType()) {
case result:
final Result r = convertResult(oaf);
r.setInstance(convertInstances(oaf));
r.setExternalReference(convertExternalRefs(oaf));
return r;
case project:
return convertProject(oaf);
case datasource:
return convertDataSource(oaf);
case organization:
return convertOrganization(oaf);
default:
throw new RuntimeException("received unknown type");
}
}
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
final ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getInstanceCount() > 0) {
return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation());
i.setHostedby(mapKV(ri.getHostedby()));
i.setInstancetype(mapQualifier(ri.getInstancetype()));
i.setLicense(mapStringField(ri.getLicense()));
i
.setUrl(
ri.getUrlList() != null ? ri
.getUrlList()
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) {
return r
.getExternalReferenceList()
.stream()
.map(e -> convertExtRef(e))
.collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static ExternalReference convertExtRef(ResultProtos.Result.ExternalReference e) {
ExternalReference ex = new ExternalReference();
ex.setUrl(e.getUrl());
ex.setSitename(e.getSitename());
ex.setRefidentifier(e.getRefidentifier());
ex.setQuery(e.getQuery());
ex.setQualifier(mapQualifier(e.getQualifier()));
ex.setLabel(e.getLabel());
ex.setDescription(e.getDescription());
ex.setDataInfo(ex.getDataInfo());
return ex;
}
private static Organization convertOrganization(OafProtos.Oaf oaf) {
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
final Organization org = setOaf(new Organization(), oaf);
setEntity(org, oaf);
org.setLegalshortname(mapStringField(m.getLegalshortname()));
org.setLegalname(mapStringField(m.getLegalname()));
org
.setAlternativeNames(
m
.getAlternativeNamesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
org.setLogourl(mapStringField(m.getLogourl()));
org.setEclegalbody(mapStringField(m.getEclegalbody()));
org.setEclegalperson(mapStringField(m.getEclegalperson()));
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
org.setEchighereducation(mapStringField(m.getEchighereducation()));
org
.setEcinternationalorganizationeurinterests(
mapStringField(m.getEcinternationalorganizationeurinterests()));
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
org.setEcenterprise(mapStringField(m.getEcenterprise()));
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
org.setEcnutscode(mapStringField(m.getEcnutscode()));
org.setCountry(mapQualifier(m.getCountry()));
return org;
}
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
final Datasource datasource = setOaf(new Datasource(), oaf);
setEntity(datasource, oaf);
datasource
.setAccessinfopackage(
m
.getAccessinfopackageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setCertificates(mapStringField(m.getCertificates()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setContactemail(mapStringField(m.getContactemail()));
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
datasource.setDescription(mapStringField(m.getDescription()));
datasource.setEnglishname(mapStringField(m.getEnglishname()));
datasource.setLatitude(mapStringField(m.getLatitude()));
datasource.setLongitude(mapStringField(m.getLongitude()));
datasource.setLogourl(mapStringField(m.getLogourl()));
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
datasource
.setOdcontenttypes(
m
.getOdcontenttypesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource
.setOdlanguages(
m
.getOdlanguagesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
datasource.setOfficialname(mapStringField(m.getOfficialname()));
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
datasource.setPidsystems(mapStringField(m.getPidsystems()));
datasource
.setPolicies(
m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
datasource
.setSubjects(
m
.getSubjectsList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
datasource.setVersioning(mapBoolField(m.getVersioning()));
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
datasource.setJournal(mapJournal(m.getJournal()));
return datasource;
}
private static Project convertProject(OafProtos.Oaf oaf) {
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
final Project project = setOaf(new Project(), oaf);
setEntity(project, oaf);
project.setAcronym(mapStringField(m.getAcronym()));
project.setCallidentifier(mapStringField(m.getCallidentifier()));
project.setCode(mapStringField(m.getCode()));
project.setContactemail(mapStringField(m.getContactemail()));
project.setContactfax(mapStringField(m.getContactfax()));
project.setContactfullname(mapStringField(m.getContactfullname()));
project.setContactphone(mapStringField(m.getContactphone()));
project.setContracttype(mapQualifier(m.getContracttype()));
project.setCurrency(mapStringField(m.getCurrency()));
project.setDuration(mapStringField(m.getDuration()));
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
project.setEcsc39(mapStringField(m.getEcsc39()));
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
project.setStartdate(mapStringField(m.getStartdate()));
project.setEnddate(mapStringField(m.getEnddate()));
project.setFundedamount(m.getFundedamount());
project.setTotalcost(m.getTotalcost());
project.setKeywords(mapStringField(m.getKeywords()));
project
.setSubjects(
m
.getSubjectsList()
.stream()
.map(sp -> mapStructuredProperty(sp))
.collect(Collectors.toList()));
project.setTitle(mapStringField(m.getTitle()));
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
project
.setFundingtree(
m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList()));
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
project.setSummary(mapStringField(m.getSummary()));
project.setOptional1(mapStringField(m.getOptional1()));
project.setOptional2(mapStringField(m.getOptional2()));
return project;
}
private static Result convertResult(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
case "dataset":
return createDataset(oaf);
case "publication":
return createPublication(oaf);
case "software":
return createSoftware(oaf);
case "other":
return createORP(oaf);
default:
Result result = setOaf(new Result(), oaf);
setEntity(result, oaf);
return setResult(result, oaf);
}
}
private static Software createSoftware(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Software software = setOaf(new Software(), oaf);
setEntity(software, oaf);
setResult(software, oaf);
software
.setDocumentationUrl(
m
.getDocumentationUrlList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
software
.setLicense(
m
.getLicenseList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
return software;
}
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
setEntity(otherResearchProducts, oaf);
setResult(otherResearchProducts, oaf);
otherResearchProducts
.setContactperson(
m
.getContactpersonList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setContactgroup(
m
.getContactgroupList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setTool(
m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList()));
return otherResearchProducts;
}
private static Publication createPublication(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Publication publication = setOaf(new Publication(), oaf);
setEntity(publication, oaf);
setResult(publication, oaf);
publication.setJournal(mapJournal(m.getJournal()));
return publication;
}
private static Dataset createDataset(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Dataset dataset = setOaf(new Dataset(), oaf);
setEntity(dataset, oaf);
setResult(dataset, oaf);
dataset.setStoragedate(mapStringField(m.getStoragedate()));
dataset.setDevice(mapStringField(m.getDevice()));
dataset.setSize(mapStringField(m.getSize()));
dataset.setVersion(mapStringField(m.getVersion()));
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
dataset
.setGeolocation(
m
.getGeolocationList()
.stream()
.map(ProtoConverter::mapGeolocation)
.collect(Collectors.toList()));
return dataset;
}
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
return oaf;
}
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final OafProtos.OafEntity e = oaf.getEntity();
entity.setId(e.getId());
entity.setOriginalId(e.getOriginalIdList());
entity
.setCollectedfrom(
e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
entity
.setPid(
e
.getPidList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDateofcollection(e.getDateofcollection());
entity.setDateoftransformation(e.getDateoftransformation());
entity
.setExtraInfo(
e
.getExtraInfoList()
.stream()
.map(ProtoConverter::mapExtraInfo)
.collect(Collectors.toList()));
return entity;
}
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
entity
.setAuthor(
m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList()));
entity.setResulttype(mapQualifier(m.getResulttype()));
entity.setLanguage(mapQualifier(m.getLanguage()));
entity
.setCountry(
m
.getCountryList()
.stream()
.map(ProtoConverter::mapQualifierAsCountry)
.collect(Collectors.toList()));
entity
.setSubject(
m
.getSubjectList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setTitle(
m
.getTitleList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setRelevantdate(
m
.getRelevantdateList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setDescription(
m
.getDescriptionList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
entity.setPublisher(mapStringField(m.getPublisher()));
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
entity
.setSource(
m
.getSourceList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFulltext(
m
.getFulltextList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFormat(
m
.getFormatList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContributor(
m
.getContributorList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setResourcetype(mapQualifier(m.getResourcetype()));
entity
.setCoverage(
m
.getCoverageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContext(
m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList()));
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
return entity;
}
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
if (instanceList != null) {
final Optional<FieldTypeProtos.Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private static Context mapContext(ResultProtos.Result.Context context) {
if (context == null || StringUtils.isBlank(context.getId())) {
return null;
}
final Context entity = new Context();
entity.setId(context.getId());
entity
.setDataInfo(
context
.getDataInfoList()
.stream()
.map(ProtoConverter::mapDataInfo)
.collect(Collectors.toList()));
return entity;
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) {
return null;
}
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
return keyValue;
}
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
dataInfo.setInferred(d.getInferred());
dataInfo.setInvisible(d.getInvisible());
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
dataInfo.setTrust(d.getTrust());
return dataInfo;
}
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(q.getClassid());
qualifier.setClassname(q.getClassname());
qualifier.setSchemeid(q.getSchemeid());
qualifier.setSchemename(q.getSchemename());
return qualifier;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country();
c.setClassid(q.getClassid());
c.setClassname(q.getClassname());
c.setSchemeid(q.getSchemeid());
c.setSchemename(q.getSchemename());
c.setDataInfo(mapDataInfo(q.getDataInfo()));
return c;
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
if (sp == null | StringUtils.isBlank(sp.getValue())) {
return null;
}
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
return structuredProperty;
}
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
final ExtraInfo entity = new ExtraInfo();
entity.setName(extraInfo.getName());
entity.setTypology(extraInfo.getTypology());
entity.setProvenance(extraInfo.getProvenance());
entity.setTrust(extraInfo.getTrust());
entity.setValue(extraInfo.getValue());
return entity;
}
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
final OAIProvenance entity = new OAIProvenance();
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
return entity;
}
public static OriginDescription mapOriginalDescription(
FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
final OriginDescription originDescriptionResult = new OriginDescription();
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
originDescriptionResult.setAltered(originDescription.getAltered());
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
return originDescriptionResult;
}
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
if (s == null || StringUtils.isBlank(s.getValue())) {
return null;
}
final Field<String> stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
return stringField;
}
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
if (b == null) {
return null;
}
final Field<Boolean> booleanField = new Field<>();
booleanField.setValue(b.getValue());
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
return booleanField;
}
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
journal.setConferenceplace(j.getConferenceplace());
journal.setEdition(j.getEdition());
journal.setEp(j.getEp());
journal.setIss(j.getIss());
journal.setIssnLinking(j.getIssnLinking());
journal.setIssnOnline(j.getIssnOnline());
journal.setIssnPrinted(j.getIssnPrinted());
journal.setName(j.getName());
journal.setSp(j.getSp());
journal.setVol(j.getVol());
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
return journal;
}
public static Author mapAuthor(FieldTypeProtos.Author author) {
final Author entity = new Author();
entity.setFullname(author.getFullname());
entity.setName(author.getName());
entity.setSurname(author.getSurname());
entity.setRank(author.getRank());
entity
.setPid(
author
.getPidList()
.stream()
.map(
kv -> {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(kv.getValue());
final Qualifier q = new Qualifier();
q.setClassid(kv.getKey());
q.setClassname(kv.getKey());
sp.setQualifier(q);
return sp;
})
.collect(Collectors.toList()));
entity
.setAffiliation(
author
.getAffiliationList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return entity;
}
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
final GeoLocation entity = new GeoLocation();
entity.setPoint(geoLocation.getPoint());
entity.setBox(geoLocation.getBox());
entity.setPlace(geoLocation.getPlace());
return entity;
}
}

View File

@ -0,0 +1,172 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class TransformActions implements Serializable {
private static final Logger log = LoggerFactory.getLogger(TransformActions.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: {}", inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark));
}
private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark)
throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info("found target directory '{}", targetDirectory);
fs.delete(targetDirectory, true);
log.info("deleted target directory '{}", targetDirectory);
}
log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory);
sc
.sequenceFile(sourcePath, Text.class, Text.class)
.map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString()))
.map(TransformActions::doTransform)
.filter(Objects::nonNull)
.mapToPair(
a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a)))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
targetDirectory.toString(),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
}
}
private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException {
// dedup similarity relations had empty target value, don't migrate them
if (aa.getTargetValue().length == 0) {
return null;
}
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid = proto_oaf
.getEntity()
.getResult()
.getMetadata()
.getResulttype()
.getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException(
"invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
}

View File

@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -68,12 +68,6 @@ public class PromoteActionPayloadForGraphTableJob {
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("strategy: {}", strategy);
Boolean shouldGroupById = Optional
.ofNullable(parser.get("shouldGroupById"))
.map(Boolean::valueOf)
.orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById);
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
@ -95,8 +89,7 @@ public class PromoteActionPayloadForGraphTableJob {
outputGraphTablePath,
strategy,
rowClazz,
actionPayloadClazz,
shouldGroupById);
actionPayloadClazz);
});
}
@ -122,12 +115,12 @@ public class PromoteActionPayloadForGraphTableJob {
String outputGraphTablePath,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
Class<A> actionPayloadClazz) {
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
Dataset<G> result = promoteActionPayloadForGraphTable(
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz)
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
saveGraphTable(result, outputGraphTablePath);
@ -160,9 +153,9 @@ public class PromoteActionPayloadForGraphTableJob {
private static String extractPayload(Row value) {
try {
return value.getAs("payload");
return value.<String> getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value);
logger.error("cannot extract payload from action: {}", value.toString());
throw e;
}
}
@ -181,8 +174,7 @@ public class PromoteActionPayloadForGraphTableJob {
Dataset<A> actionPayloadDS,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz,
Boolean shouldGroupById) {
Class<A> actionPayloadClazz) {
logger
.info(
"Promoting action payload for graph table: payload={}, table={}",
@ -194,7 +186,7 @@ public class PromoteActionPayloadForGraphTableJob {
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource;
Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions
.joinGraphTableWithActionPayloadAndMerge(
@ -206,13 +198,9 @@ public class PromoteActionPayloadForGraphTableJob {
rowClazz,
actionPayloadClazz);
if (shouldGroupById) {
return PromoteActionPayloadFunctions
.groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
} else {
return joinedAndMerged;
}
return PromoteActionPayloadFunctions
.groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
}
private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) {
@ -238,13 +226,12 @@ public class PromoteActionPayloadForGraphTableJob {
}
}
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSource() {
return t -> {
if (isSubClass(t, Relation.class)) {
final Relation rel = (Relation) t;
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
return Objects.nonNull(((Relation) t).getSource());
}
return StringUtils.isNotBlank(((OafEntity) t).getId());
return Objects.nonNull(((OafEntity) t).getId());
};
}

View File

@ -112,7 +112,6 @@ public class PromoteActionPayloadFunctions {
Class<G> rowClazz) {
TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn();
return rowDS
.filter((FilterFunction<G>) o -> isNotZeroFn.get().apply(o))
.groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING())
.agg(aggregator)
.map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz));

View File

@ -0,0 +1,56 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "sn",
"paramLongName": "sourceNameNode",
"paramDescription": "nameNode of the source cluster",
"paramRequired": true
},
{
"paramName": "tn",
"paramLongName": "targetNameNode",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirectory",
"paramDescription": "working directory",
"paramRequired": true
},
{
"paramName": "nm",
"paramLongName": "distcp_num_maps",
"paramDescription": "maximum number of map tasks used in the distcp process",
"paramRequired": true
},
{
"paramName": "mm",
"paramLongName": "distcp_memory_mb",
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
"paramRequired": true
},
{
"paramName": "tt",
"paramLongName": "distcp_task_timeout",
"paramDescription": "timeout for distcp copying actions from remote cluster",
"paramRequired": true
},
{
"paramName": "tr",
"paramLongName": "transform_only",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "inputPaths",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
}
]

View File

@ -40,11 +40,5 @@
"paramLongName": "mergeAndGetStrategy",
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
"paramRequired": true
},
{
"paramName": "sgid",
"paramLongName": "shouldGroupById",
"paramDescription": "indicates whether the promotion operation should group objects in the graph by id or not",
"paramRequired": false
}
]

View File

@ -24,10 +24,6 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -115,7 +111,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
<error to="Kill"/>
@ -167,7 +162,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -56,11 +56,6 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<value>false</value>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>

View File

@ -7,6 +7,10 @@
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
@ -15,24 +19,12 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
<value>/user/spark/applicationHistory</value>
</property>
</configuration>

View File

@ -0,0 +1,138 @@
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
<parameters>
<property>
<name>sourceNN</name>
<description>the source name node</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>workingDirectory</name>
<description>working directory</description>
</property>
<property>
<name>distcp_memory_mb</name>
<value>6144</value>
<description>memory for distcp copying actionsets from remote cluster</description>
</property>
<property>
<name>distcp_task_timeout</name>
<value>60000000</value>
<description>timeout for distcp copying actions from remote cluster</description>
</property>
<property>
<name>distcp_num_maps</name>
<value>1</value>
<description>mmaximum number of map tasks used in the distcp process</description>
</property>
<property>
<name>transform_only</name>
<description>activate tranform-only mode. Only apply transformation step</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="migrate_actionsets"/>
<action name="migrate_actionsets">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.migration.MigrateActionSet</main-class>
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--sourceNameNode</arg><arg>${sourceNN}</arg>
<arg>--targetNameNode</arg><arg>${nameNode}</arg>
<arg>--workingDirectory</arg><arg>${workingDirectory}</arg>
<arg>--distcp_num_maps</arg><arg>${distcp_num_maps}</arg>
<arg>--distcp_memory_mb</arg><arg>${distcp_memory_mb}</arg>
<arg>--distcp_task_timeout</arg><arg>${distcp_task_timeout}</arg>
<arg>--transform_only</arg><arg>${transform_only}</arg>
<capture-output/>
</java>
<ok to="transform_actions" />
<error to="fail" />
</action>
<action name="transform_actions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>transform_actions</name>
<class>eu.dnetlib.dhp.actionmanager.migration.TransformActions</class>
<jar>dhp-actionmanager-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
</spark>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end" />
</workflow-app>

View File

@ -24,10 +24,6 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -114,7 +110,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
<error to="Kill"/>
@ -166,7 +161,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -24,10 +24,6 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -115,7 +111,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
<error to="Kill"/>
@ -167,7 +162,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -24,10 +24,6 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -114,7 +110,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
<error to="Kill"/>
@ -166,7 +161,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -101,9 +101,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath",
"",
"-mergeAndGetStrategy",
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(),
"--shouldGroupById",
"true"
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name()
}));
// then
@ -143,9 +141,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath",
outputGraphTableDir.toString(),
"-mergeAndGetStrategy",
strategy.name(),
"--shouldGroupById",
"true"
strategy.name()
});
// then

View File

@ -1,27 +1,29 @@
Description of the Module
--------------------------
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.
This module defines a **collector worker application** that runs on Hadoop.
## Metadata collection
It is responsible for harvesting metadata using different plugins.
The **metadata collection workflow** is responsible for harvesting metadata records from different protocols and responding to
different formats and to store them as on HDFS so that they can be further processed.
The collector worker uses a message queue to inform the progress
of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore,
It gives, at the end of the job, some information about the status
of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
### Collector Plugins
To work the collection worker need some parameter like:
Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:
* **hdfsPath**: the path where storing the sequential file
* **apidescriptor**: the JSON encoding of the API Descriptor
* **namenode**: the Name Node URI
* **userHDFS**: the user wich create the hdfs seq file
* **rabbitUser**: the user to connect with RabbitMq for messaging
* **rabbitPassWord**: the password to connect with RabbitMq for messaging
* **rabbitHost**: the host of the RabbitMq server
* **rabbitOngoingQueue**: the name of the ongoing queue
* **rabbitReportQueue**: the name of the report queue
* **workflowId**: the identifier of the dnet Workflow
```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```
The list of the supported plugins:
* OAI Plugin: collects from OAI-PMH compatible endpoints
* MDStore plugin: collects from a given D-Net MetadataStore, (identified by moogodb URI, dbName, MDStoreID)
* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
# Transformation Plugins
TODO
##Plugins
* OAI Plugin
## Usage
TODO

View File

@ -7,12 +7,13 @@
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -37,14 +38,10 @@
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@ -59,6 +56,19 @@
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
@ -79,11 +89,6 @@
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency>
<groupId>org.apache.commons</groupId>
@ -104,11 +109,8 @@
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
</dependencies>
</project>
</project>

View File

@ -75,6 +75,7 @@ public class CollectAndSave implements Serializable {
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {

View File

@ -36,7 +36,7 @@ import scala.Tuple2;
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static final String DOI = "doi";
private static String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

View File

@ -1,86 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import org.apache.commons.io.IOUtils
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import java.io.IOException
abstract class AbstractRestClient extends Iterator[String]{
var buffer: List[String] = List()
var current_index:Int = 0
var scroll_value: Option[String] = None
var complete:Boolean = false
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
def doHTTPGETRequest(url:String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
}
def doHTTPPOSTRequest(url:String, json:String): String = {
val httpPost = new HttpPost(url)
if (json != null) {
val entity = new StringEntity(json)
httpPost.setEntity(entity)
httpPost.setHeader("Accept", "application/json")
httpPost.setHeader("Content-type", "application/json")
}
doHTTPRequest(httpPost)
}
def hasNext: Boolean = {
buffer.nonEmpty && current_index < buffer.size
}
override def next(): String = {
val next_item:String = buffer(current_index)
current_index = current_index + 1
if (current_index == buffer.size)
getBufferData()
next_item
}
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
val client = HttpClients.createDefault
var tries = 4
try {
while (tries > 0) {
println(s"requesting ${r.getURI}")
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
return IOUtils.toString(response.getEntity.getContent)
}
""
} catch {
case e: Throwable =>
throw new RuntimeException("Error on executing request ", e)
} finally try client.close()
catch {
case e: IOException =>
throw new RuntimeException("Unable to close client ", e)
}
}
getBufferData()
}

View File

@ -1,31 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import org.json4s.{DefaultFormats, JValue}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
override def extractInfo(input: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s)))
val next_url = (json \ "links" \ "next").extractOrElse[String](null)
scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None
if (scroll_value.isEmpty)
complete = true
current_index = 0
}
def get_url():String ={
val to = if (until> 0) s"$until" else "*"
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
}
override def getBufferData(): Unit = {
if (!complete) {
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
extractInfo(response)
}
}
}

View File

@ -1,13 +1,11 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
@ -17,6 +15,7 @@ import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
@ -42,6 +41,18 @@ case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val UNKNOWN_REPOSITORY_ORIGINALID = "openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18"
val DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"
val DNET_DATACITE_DATE = "dnet:dataCite_date"
val DNET_DATACITE_TITLE = "dnet:dataCite_title"
val SYSIMPORT_ACTIONSET = "sysimport:actionset"
val DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"
val PROVENANCE_ACTION_SET_QUALIFIER: Qualifier = OafMapperUtils.qualifier(SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS)
val MAIN_TITLE_QUALIFIER:Qualifier = OafMapperUtils.qualifier("main title","main title",DNET_DATACITE_TITLE,DNET_DATACITE_TITLE)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
@ -57,10 +68,10 @@ object DataciteToOAFTransformation {
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val unknown_repository: HostedByMapType = HostedByMapType(UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
@ -164,6 +175,17 @@ object DataciteToOAFTransformation {
d
}
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -228,6 +250,17 @@ object DataciteToOAFTransformation {
}
def OPEN_ACCESS_RIGHT = {
val result = new Qualifier
result.setClassid("OPEN")
result.setClassid("OPEN")
result.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
result.setSchemename(ModelConstants.DNET_ACCESS_MODES)
result
}
/**
* As describe in ticket #6377
* when the result come from figshare we need to remove subject
@ -239,7 +272,7 @@ object DataciteToOAFTransformation {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
r.getInstance().asScala.foreach(i => i.setAccessright(OPEN_ACCESS_RIGHT))
val l: List[StructuredProperty] = List()
r.setSubject(l.asJava)
}
@ -353,9 +386,9 @@ object DataciteToOAFTransformation {
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
OafMapperUtils.structuredProperty(t.title.get, MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, DNET_DATACITE_TITLE, DNET_DATACITE_TITLE, null)
}
}).asJava)
@ -377,22 +410,37 @@ object DataciteToOAFTransformation {
.map(d => d.get)
if (a_date.isDefined) {
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.map(d => (d._1.get, vocabularies.getTermAsQualifier(DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
@ -436,10 +484,10 @@ object DataciteToOAFTransformation {
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
val aRights: Option[Qualifier] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new AccessRight
val a = new Qualifier
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
@ -448,7 +496,7 @@ object DataciteToOAFTransformation {
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
@ -456,7 +504,6 @@ object DataciteToOAFTransformation {
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
if (license.isDefined)
@ -470,9 +517,6 @@ object DataciteToOAFTransformation {
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
result.setId(IdentifierFactory.createIdentifier(result))
if (result.getId == null)
return List()
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
@ -486,7 +530,7 @@ object DataciteToOAFTransformation {
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di.setProvenanceaction(PROVENANCE_ACTION_SET_QUALIFIER)
di
}

View File

@ -1,5 +1,4 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text

View File

@ -1,8 +1,6 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
@ -32,8 +30,6 @@ object GenerateDataciteDatasetSpark {
.master(master)
.getOrCreate()
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._

View File

@ -1,186 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions.max
import org.slf4j.{Logger, LoggerFactory}
import java.time.format.DateTimeFormatter._
import java.time.{LocalDate, LocalDateTime, ZoneOffset}
import scala.io.Source
object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase
val isActive = (json \ "attributes" \ "isActive").extract[Boolean]
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val hdfsuri = parser.get("namenode")
log.info(s"namenode is $hdfsuri")
val targetPath = parser.get("targetPath")
log.info(s"targetPath is $targetPath")
val dataciteDump = parser.get("dataciteDumpPath")
log.info(s"dataciteDump is $dataciteDump")
val hdfsTargetPath = new Path(targetPath)
log.info(s"hdfsTargetPath is $hdfsTargetPath")
val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport")
val spark: SparkSession = SparkSession.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
// ====== Init HDFS File System Object
val conf = new Configuration
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri)
// Because of Maven
conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("ERROR")
import spark.implicits._
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
override def zero: DataciteType = null
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
if (b == null)
return a
if (a == null)
return b
if (a.timestamp > b.timestamp) {
return a
}
b
}
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
reduce(a, b)
}
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
override def finish(reduction: DataciteType): DataciteType = reduction
}
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
val ts = dump.select(max("timestamp")).first().getLong(0)
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]
dump
.union(ds)
.groupByKey(_.doi)
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump"))
}
}
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
var from:Long = timestamp * 1000
val delta:Long = 50000000L
var client: DataciteAPIImporter = null
val now :Long =System.currentTimeMillis()
var i = 0
try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
try {
var start: Long = System.currentTimeMillis
while (from < now) {
client = new DataciteAPIImporter(from, bs, from + delta)
var end: Long = 0
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
key.set({
i += 1;
i - 1
})
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
val time = (end - start) / 1000.0F
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
}
println(s"updating from value: $from -> ${from+delta}")
from = from + delta
}
} catch {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
}
catch {
case e: Throwable =>
log.error("Error", e)
}
i
}
}

View File

@ -248,7 +248,7 @@ public class PrepareProgramme {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
}
if (current.trim().length() > parent.length()
&& current.toLowerCase().trim().startsWith(parent)) {
&& current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) {
current = current.substring(parent.length() + 1);
if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '') {
current = current.trim().substring(1).trim();

View File

@ -91,7 +91,7 @@ public class PrepareProjects {
}
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
return value -> {
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
List<CSVProject> csvProjectList = new ArrayList<>();
if (csvProject.isPresent()) {

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
import java.util.LinkedList;
public class CollectorPluginErrorLogList extends LinkedList<String> {
private static final long serialVersionUID = -6925786561303289704L;
@Override
public String toString() {
String log = new String();
int index = 0;
for (String errorMessage : this) {
log += String.format("Retry #%s: %s / ", index++, errorMessage);
}
return log;
}
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
public class CollectorServiceException extends Exception {
private static final long serialVersionUID = 7523999812098059764L;
public CollectorServiceException(String string) {
super(string);
}
public CollectorServiceException(String string, Throwable exception) {
super(string, exception);
}
public CollectorServiceException(Throwable exception) {
super(exception);
}
}

View File

@ -0,0 +1,240 @@
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.security.GeneralSecurityException;
import java.security.cert.X509Certificate;
import java.util.List;
import java.util.Map;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* @author jochen, michele, andrea
*/
public class HttpConnector {
private static final Log log = LogFactory.getLog(HttpConnector.class);
private int maxNumberOfRetry = 6;
private int defaultDelay = 120; // seconds
private int readTimeOut = 120; // seconds
private String responseType = null;
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector() {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl) throws CollectorServiceException {
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
}
/**
* Given the URL returns the content as a stream via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource as InputStream
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
}
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
try {
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
try {
return IOUtils.toString(s);
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
} finally {
IOUtils.closeQuietly(s);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
if (retryNumber > maxNumberOfRetry) {
throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList);
}
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
try {
InputStream input = null;
try {
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(readTimeOut * 1000);
urlConn.addRequestProperty("User-Agent", userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
log.warn("waiting and repeating request after " + retryAfter + " sec.");
Thread.sleep(retryAfter * 1000);
errorList.add("503 Service Unavailable");
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.debug("The requested url has been moved to " + newUrl);
errorList
.add(
String
.format(
"%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(),
newUrl));
urlConn.disconnect();
return attemptDownload(newUrl, retryNumber + 1, errorList);
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
log
.error(
String
.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
Thread.sleep(defaultDelay * 1000);
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownload(requestUrl, retryNumber + 1, errorList);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer
.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
}
/**
* register for https scheme; this is a workaround and not intended for the use in trusted environments
*/
public void initTrustManager() {
final X509TrustManager tm = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
try {
final SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(null, new TrustManager[] {
tm
}, null);
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
} catch (GeneralSecurityException e) {
log.fatal(e);
throw new IllegalStateException(e);
}
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getDefaultDelay() {
return defaultDelay;
}
public void setDefaultDelay(final int defaultDelay) {
this.defaultDelay = defaultDelay;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(final int readTimeOut) {
this.readTimeOut = readTimeOut;
}
public String getResponseType() {
return responseType;
}
}

View File

@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.HttpConnector2;
/**
* Applies the parsing of a csv file and writes the Serialization of it in hdfs
@ -28,7 +28,7 @@ public class ReadCSV implements Closeable {
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private final String csvFile;
private String csvFile;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -74,7 +74,7 @@ public class ReadCSV implements Closeable {
throws Exception {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
HttpConnector2 httpConnector = new HttpConnector2();
HttpConnector httpConnector = new HttpConnector();
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
@ -85,6 +85,7 @@ public class ReadCSV implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.csvFile = httpConnector.getInputSource(fileURL);
;
}
protected void write(final Object p) {

View File

@ -14,18 +14,19 @@ import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.HttpConnector2;
/**
* Applies the parsing of an excel file and writes the Serialization of it in hdfs
*/
public class ReadExcel implements Closeable {
private static final Log log = LogFactory.getLog(ReadCSV.class);
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private final InputStream excelFile;
private InputStream excelFile;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -72,7 +73,7 @@ public class ReadExcel implements Closeable {
throws Exception {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
HttpConnector2 httpConnector = new HttpConnector2();
HttpConnector httpConnector = new HttpConnector();
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
@ -83,6 +84,7 @@ public class ReadExcel implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
;
}
protected void write(final Object p) {

View File

@ -3,11 +3,11 @@ package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.io.InputStream;
import java.util.ArrayList;

View File

@ -1,45 +0,0 @@
package eu.dnetlib.dhp.aggregation.common;
import java.io.Serializable;
import org.apache.spark.util.LongAccumulator;
public class AggregationCounter implements Serializable {
private LongAccumulator totalItems;
private LongAccumulator errorItems;
private LongAccumulator processedItems;
public AggregationCounter() {
}
public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) {
this.totalItems = totalItems;
this.errorItems = errorItems;
this.processedItems = processedItems;
}
public LongAccumulator getTotalItems() {
return totalItems;
}
public void setTotalItems(LongAccumulator totalItems) {
this.totalItems = totalItems;
}
public LongAccumulator getErrorItems() {
return errorItems;
}
public void setErrorItems(LongAccumulator errorItems) {
this.errorItems = errorItems;
}
public LongAccumulator getProcessedItems() {
return processedItems;
}
public void setProcessedItems(LongAccumulator processedItems) {
this.processedItems = processedItems;
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.aggregation.common;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
private MessageSender messageSender;
public AggregatorReport() {
}
public AggregatorReport(MessageSender messageSender) throws IOException {
this.messageSender = messageSender;
}
public void ongoing(Long current, Long total) {
messageSender.sendMessage(current, total);
}
@Override
public void close() throws IOException {
if (Objects.nonNull(messageSender)) {
log.info("closing report: ");
this.forEach((k, v) -> log.info("{} - {}", k, v));
Map<String, String> m = new HashMap<>();
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
messageSender.sendReport(m);
}
}
}

View File

@ -1,10 +0,0 @@
package eu.dnetlib.dhp.aggregation.common;
public interface ReporterCallback {
Long getCurrent();
Long getTotal();
}

View File

@ -1,41 +0,0 @@
package eu.dnetlib.dhp.aggregation.common;
import java.util.TimerTask;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public abstract class ReportingJob {
/**
* Frequency (seconds) for sending ongoing messages to report the collection task advancement
*/
public static final int ONGOING_REPORT_FREQUENCY = 5;
/**
* Initial delay (seconds) for sending ongoing messages to report the collection task advancement
*/
public static final int INITIAL_DELAY = 2;
private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
protected final AggregatorReport report;
public ReportingJob(AggregatorReport report) {
this.report = report;
}
protected void schedule(final ReporterCallback callback) {
executor.scheduleAtFixedRate(new TimerTask() {
@Override
public void run() {
report.ongoing(callback.getCurrent(), callback.getTotal());
}
}, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS);
}
protected void shutdown() {
executor.shutdown();
}
}

View File

@ -1,136 +0,0 @@
package eu.dnetlib.dhp.aggregation.mdstore;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class MDStoreActionNode {
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
enum MDAction {
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
}
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
MDStoreActionNode.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
argumentParser.parseArgument(args);
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
log.info("Current action is {}", action);
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
switch (action) {
case NEW_VERSION: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
break;
}
case COMMIT: {
final String hdfsuri = argumentParser.get("namenode");
if (StringUtils.isBlank(hdfsuri)) {
throw new IllegalArgumentException("missing or empty argument namenode");
}
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
try (
FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri));
FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) {
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
fs.create(hdfstoreSizepath);
DNetRestClient
.doGET(
String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
}
break;
}
case ROLLBACK: {
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
case READ_LOCK: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
break;
}
case READ_UNLOCK: {
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
default:
throw new IllegalArgumentException("invalid action");
}
}
}

View File

@ -1,134 +0,0 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME;
import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DeflateCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
public class CollectorWorker extends ReportingJob {
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
private final ApiDescriptor api;
private final FileSystem fileSystem;
private final MDStoreVersion mdStoreVersion;
private final HttpClientParams clientParams;
public CollectorWorker(
final ApiDescriptor api,
final FileSystem fileSystem,
final MDStoreVersion mdStoreVersion,
final HttpClientParams clientParams,
final AggregatorReport report) {
super(report);
this.api = api;
this.fileSystem = fileSystem;
this.mdStoreVersion = mdStoreVersion;
this.clientParams = clientParams;
}
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
log.info("outputPath path is {}", outputPath);
final CollectorPlugin plugin = getCollectorPlugin();
final AtomicInteger counter = new AtomicInteger(0);
scheduleReport(counter);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
fileSystem.getConf(),
SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
plugin
.collect(api, report)
.forEach(
content -> {
key.set(counter.getAndIncrement());
value.set(content);
try {
writer.append(key, value);
} catch (Throwable e) {
throw new RuntimeException(e);
}
});
} catch (Throwable e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} finally {
shutdown();
report.ongoing(counter.longValue(), counter.longValue());
}
}
private void scheduleReport(AtomicInteger counter) {
schedule(new ReporterCallback() {
@Override
public Long getCurrent() {
return counter.longValue();
}
@Override
public Long getTotal() {
return null;
}
});
}
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
case oai:
return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
.get();
switch (plugin) {
case mdstore_mongodb_dump:
return new MongoDbDumpCollectorPlugin(fileSystem);
case mdstore_mongodb:
return new MDStoreCollectorPlugin();
default:
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
}
default:
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
}
}
}

View File

@ -1,135 +0,0 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.util.Optional;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
/**
* CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes
* into HDFS. This application will be executed on the hadoop cluster, where invoked in the context of the metadata collection
* oozie workflow, it will receive all the input parameters necessary to instantiate the specific collection plugin and the
* relative specific configurations
*
* @author Sandro La Bruzzo, Claudio Atzori
*/
public class CollectorWorkerApplication {
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
private final FileSystem fileSystem;
public CollectorWorkerApplication(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
/**
* @param args
*/
public static void main(final String[] args)
throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
CollectorWorkerApplication.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json")));
argumentParser.parseArgument(args);
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
final String hdfsuri = argumentParser.get("namenode");
log.info("hdfsURI is {}", hdfsuri);
final String apiDescriptor = argumentParser.get("apidescriptor");
log.info("apiDescriptor is {}", apiDescriptor);
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL);
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
final String workflowId = argumentParser.get("workflowId");
log.info("workflowId is {}", workflowId);
final HttpClientParams clientParams = getClientParams(argumentParser);
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
new CollectorWorkerApplication(fileSystem)
.run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId);
}
protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
String dnetMessageManagerURL, String workflowId)
throws IOException, CollectorException, UnknownCollectorPluginException {
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
try (AggregatorReport report = new AggregatorReport(ms)) {
new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
}
}
private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
final HttpClientParams clientParams = new HttpClientParams();
clientParams
.setMaxNumberOfRetry(
Optional
.ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY))
.map(Integer::parseInt)
.orElse(HttpClientParams._maxNumberOfRetry));
log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry());
clientParams
.setRequestDelay(
Optional
.ofNullable(argumentParser.get(REQUEST_DELAY))
.map(Integer::parseInt)
.orElse(HttpClientParams._requestDelay));
log.info("requestDelay is {}", clientParams.getRequestDelay());
clientParams
.setRetryDelay(
Optional
.ofNullable(argumentParser.get(RETRY_DELAY))
.map(Integer::parseInt)
.orElse(HttpClientParams._retryDelay));
log.info("retryDelay is {}", clientParams.getRetryDelay());
clientParams
.setConnectTimeOut(
Optional
.ofNullable(argumentParser.get(CONNECT_TIMEOUT))
.map(Integer::parseInt)
.orElse(HttpClientParams._connectTimeOut));
log.info("connectTimeOut is {}", clientParams.getConnectTimeOut());
clientParams
.setReadTimeOut(
Optional
.ofNullable(argumentParser.get(READ_TIMEOUT))
.map(Integer::parseInt)
.orElse(HttpClientParams._readTimeOut));
log.info("readTimeOut is {}", clientParams.getReadTimeOut());
return clientParams;
}
}

View File

@ -1,26 +1,28 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.Node;
@ -28,172 +30,19 @@ import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.schema.mdstore.Provenance;
import scala.Tuple2;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import eu.dnetlib.message.MessageType;
public class GenerateNativeStoreSparkJob {
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateNativeStoreSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/generate_native_input_parameters.json")));
parser.parseArgument(args);
final String provenanceArgument = parser.get("provenance");
log.info("Provenance is {}", provenanceArgument);
final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class);
final String dateOfCollectionArgs = parser.get("dateOfCollection");
log.info("dateOfCollection is {}", dateOfCollectionArgs);
final Long dateOfCollection = new Long(dateOfCollectionArgs);
String mdStoreVersion = parser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
String readMdStoreVersionParam = parser.get("readMdStoreVersion");
log.info("readMdStoreVersion is {}", readMdStoreVersionParam);
final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null
: MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class);
final String xpath = parser.get("xpath");
log.info("xpath is {}", xpath);
final String encoding = parser.get("encoding");
log.info("encoding is {}", encoding);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> createNativeMDStore(
spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion));
}
private static void createNativeMDStore(SparkSession spark,
Provenance provenance,
Long dateOfCollection,
String xpath,
String encoding,
MDStoreVersion currentVersion,
MDStoreVersion readVersion) throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS);
final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS);
final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
final JavaRDD<MetadataRecord> nativeStore = sc
.sequenceFile(seqFilePath, IntWritable.class, Text.class)
.map(
item -> parseRecord(
item._2().toString(),
xpath,
encoding,
provenance,
dateOfCollection,
totalItems,
invalidRecords))
.filter(Objects::nonNull)
.distinct();
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH;
if (readVersion != null) { // INCREMENTAL MODE
log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath());
Dataset<MetadataRecord> currentMdStoreVersion = spark
.read()
.load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH)
.as(encoder);
TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn();
final Dataset<MetadataRecord> map = currentMdStoreVersion
.union(mdstore)
.groupByKey(
(MapFunction<MetadataRecord, String>) MetadataRecord::getId,
Encoders.STRING())
.agg(aggregator)
.map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder);
map.select("id").takeAsList(100).forEach(s -> log.info(s.toString()));
saveDataset(map, targetPath);
} else {
saveDataset(mdstore, targetPath);
}
final Long total = spark.read().load(targetPath).count();
log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName());
writeHdfsFile(
spark.sparkContext().hadoopConfiguration(), total.toString(),
currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
}
public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> {
@Override
public MetadataRecord zero() {
return null;
}
@Override
public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) {
return getLatestRecord(b, a);
}
@Override
public MetadataRecord merge(MetadataRecord b, MetadataRecord a) {
return getLatestRecord(b, a);
}
private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) {
if (b == null)
return a;
if (a == null)
return b;
return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b;
}
@Override
public MetadataRecord finish(MetadataRecord r) {
return r;
}
@Override
public Encoder<MetadataRecord> bufferEncoder() {
return Encoders.bean(MetadataRecord.class);
}
@Override
public Encoder<MetadataRecord> outputEncoder() {
return Encoders.bean(MetadataRecord.class);
}
}
public static MetadataRecord parseRecord(
final String input,
final String xpath,
@ -215,11 +64,112 @@ public class GenerateNativeStoreSparkJob {
invalidRecords.add(1);
return null;
}
return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection);
return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection);
} catch (Throwable e) {
invalidRecords.add(1);
if (invalidRecords != null)
invalidRecords.add(1);
e.printStackTrace();
return null;
}
}
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateNativeStoreSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
parser.parseArgument(args);
final ObjectMapper jsonMapper = new ObjectMapper();
final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final Map<String, String> ongoingMap = new HashMap<>();
final Map<String, String> reportMap = new HashMap<>();
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaPairRDD<IntWritable, Text> inputRDD = sc
.sequenceFile(parser.get("input"), IntWritable.class, Text.class);
final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
final MessageManager manager = new MessageManager(
parser.get("rabbitHost"),
parser.get("rabbitUser"),
parser.get("rabbitPassword"),
false,
false,
null);
final JavaRDD<MetadataRecord> mappeRDD = inputRDD
.map(
item -> parseRecord(
item._2().toString(),
parser.get("xpath"),
parser.get("encoding"),
provenance,
dateOfCollection,
totalItems,
invalidRecords))
.filter(Objects::nonNull)
.distinct();
ongoingMap.put("ongoing", "0");
if (!test) {
manager
.sendMessage(
new Message(
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
parser.get("rabbitOngoingQueue"),
true,
false);
}
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
mdStoreRecords.add(mdstore.count());
ongoingMap.put("ongoing", "" + totalItems.value());
if (!test) {
manager
.sendMessage(
new Message(
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
parser.get("rabbitOngoingQueue"),
true,
false);
}
mdstore.write().format("parquet").save(parser.get("output"));
reportMap.put("inputItem", "" + totalItems.value());
reportMap.put("invalidRecords", "" + invalidRecords.value());
reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
if (!test) {
manager
.sendMessage(
new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
parser.get("rabbitReportQueue"),
true,
false);
manager.close();
}
});
}
}

View File

@ -1,94 +0,0 @@
package eu.dnetlib.dhp.collection;
/**
* Bundles the http connection parameters driving the client behaviour.
*/
public class HttpClientParams {
// Defaults
public static int _maxNumberOfRetry = 3;
public static int _requestDelay = 0; // milliseconds
public static int _retryDelay = 10; // seconds
public static int _connectTimeOut = 10; // seconds
public static int _readTimeOut = 30; // seconds
/**
* Maximum number of allowed retires before failing
*/
private int maxNumberOfRetry;
/**
* Delay between request (Milliseconds)
*/
private int requestDelay;
/**
* Time to wait after a failure before retrying (Seconds)
*/
private int retryDelay;
/**
* Connect timeout (Seconds)
*/
private int connectTimeOut;
/**
* Read timeout (Seconds)
*/
private int readTimeOut;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
int readTimeOut) {
this.maxNumberOfRetry = maxNumberOfRetry;
this.requestDelay = requestDelay;
this.retryDelay = retryDelay;
this.connectTimeOut = connectTimeOut;
this.readTimeOut = readTimeOut;
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getRequestDelay() {
return requestDelay;
}
public void setRequestDelay(int requestDelay) {
this.requestDelay = requestDelay;
}
public int getRetryDelay() {
return retryDelay;
}
public void setRetryDelay(int retryDelay) {
this.retryDelay = retryDelay;
}
public void setConnectTimeOut(int connectTimeOut) {
this.connectTimeOut = connectTimeOut;
}
public int getConnectTimeOut() {
return connectTimeOut;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(int readTimeOut) {
this.readTimeOut = readTimeOut;
}
}

View File

@ -1,259 +0,0 @@
package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.HttpHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
/**
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
*
* @author jochen, michele, andrea, alessia, claudio
*/
public class HttpConnector2 {
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
private static final String REPORT_PREFIX = "http:";
private HttpClientParams clientParams;
private String responseType = null;
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
}
public HttpConnector2(HttpClientParams clientParams) {
this.clientParams = clientParams;
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
return IOUtils.toInputStream(getInputSource(requestUrl));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public String getInputSource(final String requestUrl) throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @param report the list of errors
* @return the content of the downloaded resource
* @throws CollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl, AggregatorReport report)
throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, report);
}
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException {
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
return IOUtils.toString(s);
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException, IOException {
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
final String msg = String
.format(
"Max number of retries (%s/%s) exceeded, failing.",
retryNumber, getClientParams().getMaxNumberOfRetry());
log.error(msg);
throw new CollectorException(msg);
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null;
try {
if (getClientParams().getRequestDelay() > 0) {
backoffAndSleep(getClientParams().getRequestDelay());
}
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (is2xx(urlConn.getResponseCode())) {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
if (is3xx(urlConn.getResponseCode())) {
// REDIRECTS
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.info(String.format("The requested url has been moved to %s", newUrl));
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String.format("Moved to: %s", newUrl));
urlConn.disconnect();
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode())) {
// CLIENT ERROR, DO NOT RETRY
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException("4xx error: request will not be repeated. " + report);
}
if (is5xx(urlConn.getResponseCode())) {
// SERVER SIDE ERRORS RETRY ONLY on 503
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_UNAVAILABLE:
if (retryAfter > 0) {
log
.warn(
requestUrl + " - waiting and repeating request after suggested retry-after "
+ retryAfter + " sec.");
backoffAndSleep(retryAfter * 1000);
} else {
log
.warn(
requestUrl + " - waiting and repeating request after default delay of "
+ getClientParams().getRetryDelay() + " sec.");
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
}
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, report);
default:
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
}
}
throw new CollectorException(
String
.format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report)));
} catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
log.info("I'm going to sleep for {}ms", sleepTimeMs);
try {
Thread.sleep(sleepTimeMs);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0)
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
}
private boolean is2xx(final int statusCode) {
return statusCode >= 200 && statusCode <= 299;
}
private boolean is4xx(final int statusCode) {
return statusCode >= 400 && statusCode <= 499;
}
private boolean is3xx(final int statusCode) {
return statusCode >= 300 && statusCode <= 399;
}
private boolean is5xx(final int statusCode) {
return statusCode >= 500 && statusCode <= 599;
}
public String getResponseType() {
return responseType;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}

View File

@ -1,84 +0,0 @@
package eu.dnetlib.dhp.collection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class JsonUtils {
private static final Log log = LogFactory.getLog(JsonUtils.class);
public static final String wrapName = "recordWrap";
/**
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
* and work-around for the JSON to XML converting of org.json.XML-package.
*
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
*
* @param jsonInput
* @return convertedJsonKeynameOutput
*/
public String syntaxConvertJsonKeyNames(String jsonInput) {
log.trace("before convertJsonKeyNames: " + jsonInput);
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
// replace ' 's in JSON Namens with '_'
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
}
// replace forward-slash (sign '/' ) in JSON Names with '_'
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
}
// replace '(' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
}
// replace ')' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
}
// add prefix of startNumbers in JSON Keynames with 'n_'
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
}
// add prefix of only numbers in JSON Keynames with 'm_'
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
}
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
}
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
// }
// replace '=' in JSON Keynames with '-'
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
}
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
return jsonInput;
}
public String convertToXML(final String jsonRecord) {
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
log.trace("before inputStream: " + resultXml);
resultXml = XmlCleaner.cleanAllEntities(resultXml);
log.trace("after cleaning: " + resultXml);
return resultXml;
}
}

View File

@ -1,32 +0,0 @@
package eu.dnetlib.dhp.collection;
public class UnknownCollectorPluginException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public UnknownCollectorPluginException() {
super();
}
public UnknownCollectorPluginException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public UnknownCollectorPluginException(final String message, final Throwable cause) {
super(message, cause);
}
public UnknownCollectorPluginException(final String message) {
super(message);
}
public UnknownCollectorPluginException(final Throwable cause) {
super(cause);
}
}

View File

@ -3,21 +3,10 @@ package eu.dnetlib.dhp.collection.plugin;
import java.util.stream.Stream;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb
}
}
Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException;
Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
}

View File

@ -1,60 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.mongodb;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.MdstoreClient;
public class MDStoreCollectorPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class);
public static final String MONGODB_DBNAME = "mongodb_dbname";
public static final String MDSTORE_ID = "mdstore_id";
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
final String mongoBaseUrl = Optional
.ofNullable(api.getBaseUrl())
.orElseThrow(
() -> new CollectorException(
"missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl"));
log.info("mongoBaseUrl: {}", mongoBaseUrl);
final String dbName = Optional
.ofNullable(api.getParams().get(MONGODB_DBNAME))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME)));
log.info("dbName: {}", dbName);
final String mdId = Optional
.ofNullable(api.getParams().get(MDSTORE_ID))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID)));
log.info("mdId: {}", mdId);
final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName);
final MongoCollection<Document> mdstore = client.mdStore(mdId);
long size = mdstore.count();
return StreamSupport
.stream(
Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false)
.map(doc -> doc.getString("body"));
}
}

View File

@ -1,54 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.mongodb;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Optional;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.utils.DHPUtils;
public class MongoDbDumpCollectorPlugin implements CollectorPlugin {
public static final String PATH_PARAM = "path";
public static final String BODY_JSONPATH = "$.body";
public FileSystem fileSystem;
public MongoDbDumpCollectorPlugin(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
final Path path = Optional
.ofNullable(api.getParams().get("path"))
.map(Path::new)
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM)));
try {
if (!fileSystem.exists(path)) {
throw new CollectorException("path does not exist: " + path.toString());
}
return new BufferedReader(
new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset()))
.lines()
.map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s));
} catch (IOException e) {
throw new CollectorException(e);
}
}
}

View File

@ -13,11 +13,9 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
public class OaiCollectorPlugin implements CollectorPlugin {
@ -28,15 +26,8 @@ public class OaiCollectorPlugin implements CollectorPlugin {
private OaiIteratorFactory oaiIteratorFactory;
private HttpClientParams clientParams;
public OaiCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
throws CollectorException {
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
final String baseUrl = api.getBaseUrl();
final String mdFormat = api.getParams().get(FORMAT_PARAM);
final String setParam = api.getParams().get(OAI_SET_PARAM);
@ -55,26 +46,26 @@ public class OaiCollectorPlugin implements CollectorPlugin {
}
if (baseUrl == null || baseUrl.isEmpty()) {
throw new CollectorException("Param 'baseurl' is null or empty");
throw new DnetCollectorException("Param 'baseurl' is null or empty");
}
if (mdFormat == null || mdFormat.isEmpty()) {
throw new CollectorException("Param 'mdFormat' is null or empty");
throw new DnetCollectorException("Param 'mdFormat' is null or empty");
}
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
}
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
}
final Iterator<Iterator<String>> iters = sets
.stream()
.map(
set -> getOaiIteratorFactory()
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
.iterator();
return StreamSupport
@ -88,12 +79,4 @@ public class OaiCollectorPlugin implements CollectorPlugin {
}
return oaiIteratorFactory;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}

View File

@ -1,9 +1,7 @@
package eu.dnetlib.dhp.collection.plugin.oai;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Iterator;
@ -11,28 +9,24 @@ import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpConnector2;
import eu.dnetlib.dhp.collection.XmlCleaner;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
public class OaiIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
private final static String REPORT_PREFIX = "oai:";
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on
// 11/24/08 5:02 PM
private final Queue<String> queue = new PriorityBlockingQueue<>();
private final SAXReader reader = new SAXReader();
private final String baseUrl;
private final String set;
@ -41,8 +35,7 @@ public class OaiIterator implements Iterator<String> {
private final String untilDate;
private String token;
private boolean started;
private final HttpConnector2 httpConnector;
private final AggregatorReport report;
private final HttpConnector httpConnector;
public OaiIterator(
final String baseUrl,
@ -50,8 +43,7 @@ public class OaiIterator implements Iterator<String> {
final String set,
final String fromDate,
final String untilDate,
final HttpConnector2 httpConnector,
final AggregatorReport report) {
final HttpConnector httpConnector) {
this.baseUrl = baseUrl;
this.mdFormat = mdFormat;
this.set = set;
@ -59,7 +51,6 @@ public class OaiIterator implements Iterator<String> {
this.untilDate = untilDate;
this.started = false;
this.httpConnector = httpConnector;
this.report = report;
}
private void verifyStarted() {
@ -67,7 +58,7 @@ public class OaiIterator implements Iterator<String> {
this.started = true;
try {
this.token = firstPage();
} catch (final CollectorException e) {
} catch (final DnetCollectorException e) {
throw new RuntimeException(e);
}
}
@ -89,7 +80,7 @@ public class OaiIterator implements Iterator<String> {
while (queue.isEmpty() && token != null && !token.isEmpty()) {
try {
token = otherPages(token);
} catch (final CollectorException e) {
} catch (final DnetCollectorException e) {
throw new RuntimeException(e);
}
}
@ -101,7 +92,7 @@ public class OaiIterator implements Iterator<String> {
public void remove() {
}
private String firstPage() throws CollectorException {
private String firstPage() throws DnetCollectorException {
try {
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
if (set != null && !set.isEmpty()) {
@ -117,8 +108,7 @@ public class OaiIterator implements Iterator<String> {
return downloadPage(url);
} catch (final UnsupportedEncodingException e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
throw new DnetCollectorException(e);
}
}
@ -136,35 +126,32 @@ public class OaiIterator implements Iterator<String> {
return result.trim();
}
private String otherPages(final String resumptionToken) throws CollectorException {
private String otherPages(final String resumptionToken) throws DnetCollectorException {
try {
return downloadPage(
baseUrl
+ "?verb=ListRecords&resumptionToken="
+ URLEncoder.encode(resumptionToken, "UTF-8"));
} catch (final UnsupportedEncodingException e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
throw new DnetCollectorException(e);
}
}
private String downloadPage(final String url) throws CollectorException {
private String downloadPage(final String url) throws DnetCollectorException {
final String xml = httpConnector.getInputSource(url, report);
final String xml = httpConnector.getInputSource(url);
Document doc;
try {
doc = DocumentHelper.parseText(xml);
doc = reader.read(new StringReader(xml));
} catch (final DocumentException e) {
log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
report.put(e.getClass().getName(), e.getMessage());
log.warn("Error parsing xml, I try to clean it: " + xml, e);
final String cleaned = XmlCleaner.cleanAllEntities(xml);
try {
doc = DocumentHelper.parseText(xml);
doc = reader.read(new StringReader(cleaned));
} catch (final DocumentException e1) {
final String resumptionToken = extractResumptionToken(xml);
if (resumptionToken == null) {
report.put(e1.getClass().getName(), e1.getMessage());
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1);
}
return resumptionToken;
}
@ -172,35 +159,19 @@ public class OaiIterator implements Iterator<String> {
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
if (errorNode != null) {
final String code = errorNode.valueOf("@code").trim();
if ("noRecordsMatch".equalsIgnoreCase(code)) {
final String msg = "noRecordsMatch for oai call : " + url;
log.warn(msg);
report.put(REPORT_PREFIX + code, msg);
final String code = errorNode.valueOf("@code");
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
log.warn("noRecordsMatch for oai call: " + url);
return null;
} else {
final String msg = code + " - " + errorNode.getText();
report.put(REPORT_PREFIX + "error", msg);
throw new CollectorException(msg);
throw new DnetCollectorException(code + " - " + errorNode.getText());
}
}
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
final StringWriter sw = new StringWriter();
final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
try {
writer.write((Node) o);
queue.add(sw.toString());
} catch (IOException e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e);
}
queue.add(((Node) o).asXML());
}
return doc.valueOf("//*[local-name()='resumptionToken']");
}
public AggregatorReport getReport() {
return report;
}
}

View File

@ -3,28 +3,24 @@ package eu.dnetlib.dhp.collection.plugin.oai;
import java.util.Iterator;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.HttpConnector2;
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
public class OaiIteratorFactory {
private HttpConnector2 httpConnector;
private HttpConnector httpConnector;
public Iterator<String> newIterator(
final String baseUrl,
final String mdFormat,
final String set,
final String fromDate,
final String untilDate,
final HttpClientParams clientParams,
final AggregatorReport report) {
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report);
final String untilDate) {
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector());
}
private HttpConnector2 getHttpConnector(HttpClientParams clientParams) {
private HttpConnector getHttpConnector() {
if (httpConnector == null)
httpConnector = new HttpConnector2(clientParams);
httpConnector = new HttpConnector();
return httpConnector;
}
}

View File

@ -1,105 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
/**
* TODO: delegate HTTP requests to the common HttpConnector2 implementation.
*
* @author js, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestCollectorPlugin implements CollectorPlugin {
public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
private final HttpClientParams clientParams;
public RestCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
final String baseUrl = api.getBaseUrl();
final String resumptionType = api.getParams().get("resumptionType");
final String resumptionParam = api.getParams().get("resumptionParam");
final String resumptionXpath = api.getParams().get("resumptionXpath");
final String resultTotalXpath = api.getParams().get("resultTotalXpath");
final String resultFormatParam = api.getParams().get("resultFormatParam");
final String resultFormatValue = api.getParams().get("resultFormatValue");
final String resultSizeParam = api.getParams().get("resultSizeParam");
final String queryParams = api.getParams().get("queryParams");
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank)
.orElse(RESULT_SIZE_VALUE_DEFAULT);
if (StringUtils.isBlank(baseUrl)) {
throw new CollectorException("Param 'baseUrl' is null or empty");
}
if (StringUtils.isBlank(resumptionType)) {
throw new CollectorException("Param 'resumptionType' is null or empty");
}
if (StringUtils.isBlank(resumptionParam)) {
throw new CollectorException("Param 'resumptionParam' is null or empty");
}
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
final String resultOutputFormat = Optional
.ofNullable(api.getParams().get("resultOutputFormat"))
.map(String::toLowerCase)
.filter(StringUtils::isNotBlank)
.orElse(resultFormatValue.toLowerCase());
RestIterator it = new RestIterator(
getClientParams(),
baseUrl,
resumptionType,
resumptionParam,
resumptionXpath,
resultTotalXpath,
resultFormatParam,
resultFormatValue,
resultSizeParam,
resultSizeValue,
queryParams,
entityXpath,
authMethod,
authToken,
resultOutputFormat);
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
}
public HttpClientParams getClientParams() {
return clientParams;
}
}

View File

@ -1,411 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.JsonUtils;
/**
* log.info(...) equal to log.trace(...) in the application-logs
* <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
*
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8";
private final HttpClientParams clientParams;
private final String BASIC = "basic";
private final JsonUtils jsonUtils;
private final String baseUrl;
private final String resumptionType;
private final String resumptionParam;
private final String resultFormatValue;
private String queryParams;
private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
// or token scanned from results)
private InputStream resultStream;
private Transformer transformer;
private XPath xpath;
private String query;
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private final String queryFormat;
private final String querySize;
private final String authMethod;
private final String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private int discoverResultSize = 0;
private int pagination = 1;
/*
* While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in
* json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
* json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
*/
private final String resultOutputFormat;
/** RestIterator class
* compatible to version 1.3.33
*/
public RestIterator(
final HttpClientParams clientParams,
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath,
final String authMethod,
final String authToken,
final String resultOutputFormat) {
this.clientParams = clientParams;
this.jsonUtils = new JsonUtils();
this.baseUrl = baseUrl;
this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue;
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
this.queryParams = queryParams;
this.authMethod = authMethod;
this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat;
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
}
initQueue();
}
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
throws TransformerConfigurationException, XPathExpressionException {
transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath);
}
private void initQueue() {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
log.info("REST calls starting with " + query);
}
private void disconnect() {
// TODO close inputstream
}
/*
* (non-Javadoc)
* @see java.util.Iterator#hasNext()
*/
@Override
public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) {
disconnect();
return false;
} else {
return true;
}
}
/*
* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) {
try {
query = downloadPage(query);
} catch (CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: " + e);
throw new RuntimeException(e);
}
}
return recordQueue.poll();
}
}
/*
* download page and return nextQuery
*/
private String downloadPage(String query) throws CollectorException {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = "";
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
int urlOldResumptionSize = 0;
InputStream theHttpInputStream;
// check if cursor=* is initial set otherwise add it to the queryParam URL
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
if (!query.contains("&cursor=")) {
query += "&cursor=*";
}
}
try {
log.info("requestig URL [{}]", query);
URL qUrl = new URL(query);
log.debug("authMethod :" + authMethod);
if ("bearer".equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
}
resultStream = theHttpInputStream;
if ("json".equals(resultOutputFormat)) {
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = jsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: " + nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
String toEnqueue = sw.toString();
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
} else {
recordQueue.add(sw.toString());
}
}
} else {
log.warn("resultXml is equal with emptyXml");
}
resumptionInt += resultSizeValue;
switch (resumptionType.toLowerCase()) {
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
resumptionStr = xprResumptionPath.evaluate(resultNode);
break;
case "count": // begin at one step for all records, iterate over items
resumptionStr = Integer.toString(resumptionInt);
break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (resultSizeValue < 2) {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
String[] arrayQUrlArgument = qUrlArgument.split("&");
for (String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(resumptionParam)) {
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
} else {
log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
}
}
}
if (((emptyXml).equalsIgnoreCase(resultXml))
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
// resumptionStr = "";
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
resultTotal = discoverResultSize;
} else {
resumptionStr = Integer.toString(resumptionInt);
resultTotal = resumptionInt + 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
}
log.info("discoverResultSize: {}", discoverResultSize);
break;
case "pagination":
case "page": // pagination, iterate over page numbers
pagination += 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
} else {
resultTotal = discoverResultSize;
pagination = discoverResultSize;
}
resumptionInt = pagination;
resumptionStr = Integer.toString(resumptionInt);
break;
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
// solr)
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
// deep-cursor, Param 'resultSizeValue' is less than 2");}
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
queryParams = queryParams.replace("&cursor=*", "");
// terminating if length of nodeList is 0
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
} else {
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
// because the iteration is over
// real length and the
// resultSizeValue is added before
// the switch()
}
discoverResultSize = nodeList.getLength();
log
.debug(
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
break;
default: // otherwise: abort
// resultTotal = resumptionInt;
break;
}
} catch (Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("collection failed: " + e.getMessage());
}
try {
if (resultTotal == -1) {
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + resultTotal);
}
} catch (Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
}
log.debug("resultTotal: " + resultTotal);
log.debug("resInt: " + resumptionInt);
if (resumptionInt <= resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+ queryFormat;
} else {
nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
}
private boolean isInteger(String s) {
boolean isValidInteger = false;
try {
Integer.parseInt(s);
// s is a valid integer
isValidInteger = true;
} catch (NumberFormatException ex) {
// s is not an integer
}
return isValidInteger;
}
// Method to encode a string value using `UTF-8` encoding scheme
private String encodeValue(String value) {
try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause());
}
}
public String getResultFormatValue() {
return resultFormatValue;
}
public String getResultOutputFormat() {
return resultOutputFormat;
}
}

View File

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.worker;
public class CollectorException extends Exception {
public class DnetCollectorException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public CollectorException() {
public DnetCollectorException() {
super();
}
public CollectorException(
public DnetCollectorException(
final String message,
final Throwable cause,
final boolean enableSuppression,
@ -18,15 +18,15 @@ public class CollectorException extends Exception {
super(message, cause, enableSuppression, writableStackTrace);
}
public CollectorException(final String message, final Throwable cause) {
public DnetCollectorException(final String message, final Throwable cause) {
super(message, cause);
}
public CollectorException(final String message) {
public DnetCollectorException(final String message) {
super(message);
}
public CollectorException(final Throwable cause) {
public DnetCollectorException(final Throwable cause) {
super(cause);
}
}

View File

@ -0,0 +1,139 @@
package eu.dnetlib.dhp.collection.worker;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import eu.dnetlib.message.MessageType;
public class DnetCollectorWorker {
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);
private final CollectorPluginFactory collectorPluginFactory;
private final ArgumentApplicationParser argumentParser;
private final MessageManager manager;
public DnetCollectorWorker(
final CollectorPluginFactory collectorPluginFactory,
final ArgumentApplicationParser argumentParser,
final MessageManager manager)
throws DnetCollectorException {
this.collectorPluginFactory = collectorPluginFactory;
this.argumentParser = argumentParser;
this.manager = manager;
}
public void collect() throws DnetCollectorException {
try {
final ObjectMapper jsonMapper = new ObjectMapper();
final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);
final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
final String hdfsuri = argumentParser.get("namenode");
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
System.setProperty("hadoop.home.dir", "/");
// Get the filesystem - HDFS
FileSystem.get(URI.create(hdfsuri), conf);
Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));
log.info("Created path " + hdfswritepath.toString());
final Map<String, String> ongoingMap = new HashMap<>();
final Map<String, String> reportMap = new HashMap<>();
final AtomicInteger counter = new AtomicInteger(0);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfswritepath),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
plugin
.collect(api)
.forEach(
content -> {
key.set(counter.getAndIncrement());
value.set(content);
if (counter.get() % 10 == 0) {
try {
ongoingMap.put("ongoing", "" + counter.get());
log
.debug(
"Sending message: "
+ manager
.sendMessage(
new Message(
argumentParser.get("workflowId"),
"Collection",
MessageType.ONGOING,
ongoingMap),
argumentParser.get("rabbitOngoingQueue"),
true,
false));
} catch (Exception e) {
log.error("Error on sending message ", e);
}
}
try {
writer.append(key, value);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
}
ongoingMap.put("ongoing", "" + counter.get());
manager
.sendMessage(
new Message(
argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
argumentParser.get("rabbitOngoingQueue"),
true,
false);
reportMap.put("collected", "" + counter.get());
manager
.sendMessage(
new Message(
argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
argumentParser.get("rabbitOngoingQueue"),
true,
false);
manager.close();
} catch (Throwable e) {
throw new DnetCollectorException("Error on collecting ", e);
}
}
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.collection.worker;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.MessageManager;
/**
* DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
* will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
* plugin to use and where store the data into HDFS path
*
* @author Sandro La Bruzzo
*/
public class DnetCollectorWorkerApplication {
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
private static ArgumentApplicationParser argumentParser;
/** @param args */
public static void main(final String[] args) throws Exception {
argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
DnetCollectorWorker.class
.getResourceAsStream(
"/eu/dnetlib/collector/worker/collector_parameter.json")));
argumentParser.parseArgument(args);
log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
log.info("json = " + argumentParser.get("apidescriptor"));
final MessageManager manager = new MessageManager(
argumentParser.get("rabbitHost"),
argumentParser.get("rabbitUser"),
argumentParser.get("rabbitPassword"),
false,
false,
null);
final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
worker.collect();
}
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.dhp.collection.worker.utils;
import java.util.LinkedList;
public class CollectorPluginErrorLogList extends LinkedList<String> {
private static final long serialVersionUID = -6925786561303289704L;
@Override
public String toString() {
String log = "";
int index = 0;
for (final String errorMessage : this) {
log += String.format("Retry #%s: %s / ", index++, errorMessage);
}
return log;
}
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.collection.worker.utils;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
public class CollectorPluginFactory {
public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException {
if (protocol == null)
throw new DnetCollectorException("protocol cannot be null");
switch (protocol.toLowerCase().trim()) {
case "oai":
return new OaiCollectorPlugin();
default:
throw new DnetCollectorException("UNknown protocol");
}
}
}

View File

@ -0,0 +1,244 @@
package eu.dnetlib.dhp.collection.worker.utils;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.security.GeneralSecurityException;
import java.security.cert.X509Certificate;
import java.util.List;
import java.util.Map;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
public class HttpConnector {
private static final Log log = LogFactory.getLog(HttpConnector.class);
private int maxNumberOfRetry = 6;
private int defaultDelay = 120; // seconds
private int readTimeOut = 120; // seconds
private String responseType = null;
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector() {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl) throws DnetCollectorException {
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
}
/**
* Given the URL returns the content as a stream via HTTP GET
*
* @param requestUrl the URL
* @return the content of the downloaded resource as InputStream
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
}
private String attemptDownlaodAsString(
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
throws DnetCollectorException {
try {
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
try {
return IOUtils.toString(s);
} catch (final IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
} finally {
IOUtils.closeQuietly(s);
}
} catch (final InterruptedException e) {
throw new DnetCollectorException(e);
}
}
private InputStream attemptDownload(
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
throws DnetCollectorException {
if (retryNumber > maxNumberOfRetry) {
throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList);
}
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
try {
InputStream input = null;
try {
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(readTimeOut * 1000);
urlConn.addRequestProperty("User-Agent", userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
log.warn("waiting and repeating request after " + retryAfter + " sec.");
Thread.sleep(retryAfter * 1000);
errorList.add("503 Service Unavailable");
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.debug("The requested url has been moved to " + newUrl);
errorList
.add(
String
.format(
"%s %s. Moved to: %s",
urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
urlConn.disconnect();
return attemptDownload(newUrl, retryNumber + 1, errorList);
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
log
.error(
String
.format(
"HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
Thread.sleep(defaultDelay * 1000);
errorList
.add(
String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
} catch (final IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownload(requestUrl, retryNumber + 1, errorList);
}
} catch (final InterruptedException e) {
throw new DnetCollectorException(e);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (final String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (final String key : headerMap.keySet()) {
if (key != null
&& key.toLowerCase().equals("retry-after")
&& headerMap.get(key).size() > 0
&& NumberUtils.isNumber(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap)
throws DnetCollectorException {
for (final String key : headerMap.keySet()) {
if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) {
return headerMap.get(key).get(0);
}
}
throw new DnetCollectorException(
"The requested url has been MOVED, but 'location' param is MISSING");
}
/**
* register for https scheme; this is a workaround and not intended for the use in trusted environments
*/
public void initTrustManager() {
final X509TrustManager tm = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
try {
final SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(null, new TrustManager[] {
tm
}, null);
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
} catch (final GeneralSecurityException e) {
log.fatal(e);
throw new IllegalStateException(e);
}
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getDefaultDelay() {
return defaultDelay;
}
public void setDefaultDelay(final int defaultDelay) {
this.defaultDelay = defaultDelay;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(final int readTimeOut) {
this.readTimeOut = readTimeOut;
}
public String getResponseType() {
return responseType;
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.worker.utils;
import java.util.HashMap;
import java.util.HashSet;

View File

@ -1,29 +0,0 @@
package eu.dnetlib.dhp.transformation;
public class DnetTransformationException extends Exception {
public DnetTransformationException() {
super();
}
public DnetTransformationException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public DnetTransformationException(final String message, final Throwable cause) {
super(message, cause);
}
public DnetTransformationException(final String message) {
super(message);
}
public DnetTransformationException(final Throwable cause) {
super(cause);
}
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.dhp.transformation;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
import java.util.Map;
import javax.xml.transform.stream.StreamSource;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.functions.Cleaner;
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import net.sf.saxon.s9api.*;
public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
private final LongAccumulator totalItems;
private final LongAccumulator errorItems;
private final LongAccumulator transformedItems;
private final String transformationRule;
private final Cleaner cleanFunction;
private final long dateOfTransformation;
public TransformFunction(
LongAccumulator totalItems,
LongAccumulator errorItems,
LongAccumulator transformedItems,
final String transformationRule,
long dateOfTransformation,
final Map<String, Vocabulary> vocabularies)
throws Exception {
this.totalItems = totalItems;
this.errorItems = errorItems;
this.transformedItems = transformedItems;
this.transformationRule = transformationRule;
this.dateOfTransformation = dateOfTransformation;
cleanFunction = new Cleaner(vocabularies);
}
@Override
public MetadataRecord call(MetadataRecord value) {
totalItems.add(1);
try {
Processor processor = new Processor(false);
processor.registerExtensionFunction(cleanFunction);
final XsltCompiler comp = processor.newXsltCompiler();
XsltExecutable xslt = comp
.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
XdmNode source = processor
.newDocumentBuilder()
.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
XsltTransformer trans = xslt.load();
trans.setInitialContextNode(source);
final StringWriter output = new StringWriter();
Serializer out = processor.newSerializer(output);
out.setOutputProperty(Serializer.Property.METHOD, "xml");
out.setOutputProperty(Serializer.Property.INDENT, "yes");
trans.setDestination(out);
trans.transform();
final String xml = output.toString();
value.setBody(xml);
value.setDateOfTransformation(dateOfTransformation);
transformedItems.add(1);
return value;
} catch (Throwable e) {
errorItems.add(1);
return null;
}
}
}

View File

@ -1,43 +1,43 @@
package eu.dnetlib.dhp.transformation;
import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.ByteArrayInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import eu.dnetlib.message.MessageType;
public class TransformSparkJobNode {
private static final Logger log = LoggerFactory.getLogger(TransformSparkJobNode.class);
private static final int RECORDS_PER_TASK = 200;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -55,107 +55,67 @@ public class TransformSparkJobNode {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String mdstoreInputVersion = parser.get("mdstoreInputVersion");
final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion");
final String inputPath = parser.get("input");
final String outputPath = parser.get("output");
final String workflowId = parser.get("workflowId");
final String trasformationRule = extractXSLTFromTR(
Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule"))));
final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
log.info("inputPath: {}", inputPath);
final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
log.info("outputBasePath: {}", outputBasePath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info(String.format("isLookupUrl: %s", isLookupUrl));
final String dateOfTransformation = parser.get("dateOfTransformation");
log.info(String.format("dateOfTransformation: %s", dateOfTransformation));
final Integer rpt = Optional
.ofNullable(parser.get("recordsPerTask"))
.map(Integer::valueOf)
.orElse(RECORDS_PER_TASK);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size());
final String rabbitUser = parser.get("rabbitUser");
final String rabbitPassword = parser.get("rabbitPassword");
final String rabbitHost = parser.get("rabbitHost");
final String rabbitReportQueue = parser.get("rabbitReportQueue");
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
transformRecords(
parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath, rpt);
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
final Map<String, Vocabulary> vocabularies = new HashMap<>();
vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
final TransformFunction transformFunction = new TransformFunction(
totalItems,
errorItems,
transformedItems,
trasformationRule,
dateOfCollection,
vocabularies);
mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath);
if (rabbitHost != null) {
System.out.println("SEND FINAL REPORT");
final Map<String, String> reportMap = new HashMap<>();
reportMap.put("inputItem", "" + totalItems.value());
reportMap.put("invalidRecords", "" + errorItems.value());
reportMap.put("mdStoreSize", "" + transformedItems.value());
System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap));
if (!test) {
final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false,
false,
null);
manager
.sendMessage(
new Message(workflowId, "Transform", MessageType.REPORT, reportMap),
rabbitReportQueue,
true,
false);
manager.close();
}
}
});
}
public static void transformRecords(final Map<String, String> args, final ISLookUpService isLookUpService,
final SparkSession spark, final String inputPath, final String outputBasePath, final Integer rpt)
throws DnetTransformationException, IOException {
final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS);
final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS);
final LongAccumulator transformedItems = spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS);
final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL);
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
final String workflowId = args.get("workflowId");
log.info("workflowId is {}", workflowId);
MapFunction<MetadataRecord, MetadataRecord> x = TransformationFactory
.getTransformationPlugin(args, ct, isLookUpService);
final Dataset<MetadataRecord> inputMDStore = spark
.read()
.format("parquet")
.load(inputPath)
.as(encoder);
final long totalInput = inputMDStore.count();
final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId);
try (AggregatorReport report = new AggregatorReport(messageSender)) {
try {
JavaRDD<MetadataRecord> mdstore = inputMDStore
.javaRDD()
.repartition(getRepartitionNumber(totalInput, rpt))
.map((Function<MetadataRecord, MetadataRecord>) x::call);
saveDataset(spark.createDataset(mdstore.rdd(), encoder), outputBasePath + MDSTORE_DATA_PATH);
log.info("Transformed item " + ct.getProcessedItems().count());
log.info("Total item " + ct.getTotalItems().count());
log.info("Transformation Error item " + ct.getErrorItems().count());
final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count();
writeHdfsFile(
spark.sparkContext().hadoopConfiguration(),
"" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH);
} catch (Throwable e) {
log.error("error during record transformation", e);
report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage());
report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString());
report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString());
report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString());
throw e;
}
}
private static String extractXSLTFromTR(final String tr) throws DocumentException {
SAXReader reader = new SAXReader();
Document document = reader.read(new ByteArrayInputStream(tr.getBytes()));
Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
return node.asXML();
}
/**
* Calculates the number of partitions allocating at most @rpt records for a single transformation task.
* @param totalInput
* @param rpt
* @return
*/
private static int getRepartitionNumber(long totalInput, Integer rpt) {
return Math.max(1, (int) (totalInput / rpt));
}
}

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.transformation;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class TransformationFactory {
private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]";
public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
final Map<String, String> jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService)
throws DnetTransformationException {
try {
final String transformationPlugin = jobArgument.get("transformationPlugin");
log.info("Transformation plugin required " + transformationPlugin);
switch (transformationPlugin) {
case "XSLT_TRANSFORM": {
final String transformationRuleId = jobArgument.get("transformationRuleId");
if (StringUtils.isBlank(transformationRuleId))
throw new DnetTransformationException("Missing Parameter transformationRule");
final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
final String transformationRule = queryTransformationRuleFromIS(
transformationRuleId, isLookupService);
final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation"));
return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation,
vocabularies);
}
default:
throw new DnetTransformationException(
"transformation plugin does not exists for " + transformationPlugin);
}
} catch (Throwable e) {
throw new DnetTransformationException(e);
}
}
private static String queryTransformationRuleFromIS(final String transformationRuleId,
final ISLookUpService isLookUpService) throws Exception {
final String query = String.format(TRULE_XQUERY, transformationRuleId);
System.out.println("asking query to IS: " + query);
List<String> result = isLookUpService.quickSearchProfile(query);
if (result == null || result.isEmpty())
throw new DnetTransformationException(
"Unable to find transformation rule with name: " + transformationRuleId);
return result.get(0);
}
}

View File

@ -1,25 +1,25 @@
package eu.dnetlib.dhp.transformation.xslt;
package eu.dnetlib.dhp.transformation.functions;
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
import java.util.Map;
import java.util.Optional;
import java.io.Serializable;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.transformation.vocabulary.Term;
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import net.sf.saxon.s9api.*;
import scala.Serializable;
public class Cleaner implements ExtensionFunction, Serializable {
private final VocabularyGroup vocabularies;
private final Map<String, Vocabulary> vocabularies;
public Cleaner(final VocabularyGroup vocabularies) {
public Cleaner(Map<String, Vocabulary> vocabularies) {
this.vocabularies = vocabularies;
}
@Override
public QName getName() {
return new QName(QNAME_BASE_URI + "/clean", "clean");
return new QName("http://eu/dnetlib/trasform/extension", "clean");
}
@Override
@ -30,22 +30,23 @@ public class Cleaner implements ExtensionFunction, Serializable {
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] {
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE),
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
};
}
@Override
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
XdmValue r = xdmValues[0];
if (r.size() == 0) {
return new XdmAtomicValue("");
}
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue);
Optional<Term> cleanedValue = vocabularies
.get(vocabularyName)
.getTerms()
.stream()
.filter(it -> it.getNativeName().equalsIgnoreCase(currentValue))
.findAny();
return new XdmAtomicValue(
cleanedValue != null ? cleanedValue.getClassid() : currentValue);
cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue);
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.transformation.vocabulary;
import java.io.Serializable;
public class Term implements Serializable {
private String englishName;
private String nativeName;
private String encoding;
private String code;
private String synonyms;
public String getEnglishName() {
return englishName;
}
public void setEnglishName(String englishName) {
this.englishName = englishName;
}
public String getNativeName() {
return nativeName;
}
public void setNativeName(String nativeName) {
this.nativeName = nativeName;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getSynonyms() {
return synonyms;
}
public void setSynonyms(String synonyms) {
this.synonyms = synonyms;
}
}

Some files were not shown because too many files have changed in this diff Show More