forked from antonis.lempesis/dnet-hadoop
fix conflicts
This commit is contained in:
commit
c73072079d
|
@ -7,6 +7,8 @@
|
||||||
*.iws
|
*.iws
|
||||||
*~
|
*~
|
||||||
.vscode
|
.vscode
|
||||||
|
.metals
|
||||||
|
.bloop
|
||||||
.classpath
|
.classpath
|
||||||
/*/.classpath
|
/*/.classpath
|
||||||
/*/*/.classpath
|
/*/*/.classpath
|
||||||
|
@ -24,4 +26,5 @@
|
||||||
spark-warehouse
|
spark-warehouse
|
||||||
/**/job-override.properties
|
/**/job-override.properties
|
||||||
/**/*.log
|
/**/*.log
|
||||||
|
/**/.factorypath
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,8 @@
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp</artifactId>
|
<artifactId>dhp</artifactId>
|
||||||
<version>1.2.4-SNAPSHOT</version>
|
<version>1.2.4-SNAPSHOT</version>
|
||||||
<relativePath>../</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
|
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<artifactId>dhp-common</artifactId>
|
<artifactId>dhp-common</artifactId>
|
||||||
|
@ -29,12 +30,6 @@
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_2.11</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
|
||||||
<artifactId>dhp-schemas</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-cli</groupId>
|
<groupId>commons-cli</groupId>
|
||||||
<artifactId>commons-cli</artifactId>
|
<artifactId>commons-cli</artifactId>
|
||||||
|
@ -59,11 +54,6 @@
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-databind</artifactId>
|
<artifactId>jackson-databind</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.rabbitmq</groupId>
|
|
||||||
<artifactId>amqp-client</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
|
@ -104,6 +94,21 @@
|
||||||
<artifactId>dnet-pace-core</artifactId>
|
<artifactId>dnet-pace-core</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpclient</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mongodb</groupId>
|
||||||
|
<artifactId>mongo-java-driver</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-schemas</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -2,12 +2,16 @@
|
||||||
package eu.dnetlib.data.mdstore.manager.common.model;
|
package eu.dnetlib.data.mdstore.manager.common.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
import javax.persistence.Column;
|
import javax.persistence.Column;
|
||||||
import javax.persistence.Entity;
|
import javax.persistence.Entity;
|
||||||
import javax.persistence.Id;
|
import javax.persistence.Id;
|
||||||
import javax.persistence.Table;
|
import javax.persistence.Table;
|
||||||
|
import javax.persistence.Temporal;
|
||||||
|
import javax.persistence.TemporalType;
|
||||||
|
|
||||||
@Entity
|
@Entity
|
||||||
@Table(name = "mdstores")
|
@Table(name = "mdstores")
|
||||||
|
@ -38,6 +42,13 @@ public class MDStore implements Serializable {
|
||||||
@Column(name = "api_id")
|
@Column(name = "api_id")
|
||||||
private String apiId;
|
private String apiId;
|
||||||
|
|
||||||
|
@Column(name = "hdfs_path")
|
||||||
|
private String hdfsPath;
|
||||||
|
|
||||||
|
@Column(name = "creation_date")
|
||||||
|
@Temporal(TemporalType.TIMESTAMP)
|
||||||
|
private Date creationDate;
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
@ -94,9 +105,28 @@ public class MDStore implements Serializable {
|
||||||
this.apiId = apiId;
|
this.apiId = apiId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHdfsPath() {
|
||||||
|
return hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHdfsPath(final String hdfsPath) {
|
||||||
|
this.hdfsPath = hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Date getCreationDate() {
|
||||||
|
return creationDate;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCreationDate(final Date creationDate) {
|
||||||
|
this.creationDate = creationDate;
|
||||||
|
}
|
||||||
|
|
||||||
public static MDStore newInstance(
|
public static MDStore newInstance(
|
||||||
final String format, final String layout, final String interpretation) {
|
final String format,
|
||||||
return newInstance(format, layout, interpretation, null, null, null);
|
final String layout,
|
||||||
|
final String interpretation,
|
||||||
|
final String hdfsBasePath) {
|
||||||
|
return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static MDStore newInstance(
|
public static MDStore newInstance(
|
||||||
|
@ -105,15 +135,48 @@ public class MDStore implements Serializable {
|
||||||
final String interpretation,
|
final String interpretation,
|
||||||
final String dsName,
|
final String dsName,
|
||||||
final String dsId,
|
final String dsId,
|
||||||
final String apiId) {
|
final String apiId,
|
||||||
|
final String hdfsBasePath) {
|
||||||
|
|
||||||
|
final String mdId = "md-" + UUID.randomUUID();
|
||||||
|
|
||||||
final MDStore md = new MDStore();
|
final MDStore md = new MDStore();
|
||||||
md.setId("md-" + UUID.randomUUID());
|
md.setId(mdId);
|
||||||
md.setFormat(format);
|
md.setFormat(format);
|
||||||
md.setLayout(layout);
|
md.setLayout(layout);
|
||||||
md.setInterpretation(interpretation);
|
md.setInterpretation(interpretation);
|
||||||
|
md.setCreationDate(new Date());
|
||||||
md.setDatasourceName(dsName);
|
md.setDatasourceName(dsName);
|
||||||
md.setDatasourceId(dsId);
|
md.setDatasourceId(dsId);
|
||||||
md.setApiId(apiId);
|
md.setApiId(apiId);
|
||||||
|
md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
|
||||||
|
|
||||||
return md;
|
return md;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String
|
||||||
|
.format(
|
||||||
|
"MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
|
||||||
|
id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof MDStore)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final MDStore other = (MDStore) obj;
|
||||||
|
return Objects.equals(id, other.id);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.data.mdstore.manager.common.model;
|
package eu.dnetlib.data.mdstore.manager.common.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import javax.persistence.Column;
|
import javax.persistence.Column;
|
||||||
import javax.persistence.Entity;
|
import javax.persistence.Entity;
|
||||||
|
@ -48,4 +49,26 @@ public class MDStoreCurrentVersion implements Serializable {
|
||||||
public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
|
public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
|
||||||
return newInstance(v.getMdstore(), v.getId());
|
return newInstance(v.getMdstore(), v.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(currentVersion, mdstore);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof MDStoreCurrentVersion)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
|
||||||
|
return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import javax.persistence.Column;
|
import javax.persistence.Column;
|
||||||
import javax.persistence.Entity;
|
import javax.persistence.Entity;
|
||||||
|
@ -38,15 +39,22 @@ public class MDStoreVersion implements Serializable {
|
||||||
@Column(name = "size")
|
@Column(name = "size")
|
||||||
private long size = 0;
|
private long size = 0;
|
||||||
|
|
||||||
public static MDStoreVersion newInstance(final String mdId, final boolean writing) {
|
@Column(name = "hdfs_path")
|
||||||
final MDStoreVersion t = new MDStoreVersion();
|
private String hdfsPath;
|
||||||
t.setId(mdId + "-" + new Date().getTime());
|
|
||||||
t.setMdstore(mdId);
|
public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
|
||||||
t.setLastUpdate(null);
|
final MDStoreVersion v = new MDStoreVersion();
|
||||||
t.setWriting(writing);
|
|
||||||
t.setReadCount(0);
|
final String versionId = mdId + "-" + new Date().getTime();
|
||||||
t.setSize(0);
|
v.setId(versionId);
|
||||||
return t;
|
v.setMdstore(mdId);
|
||||||
|
v.setLastUpdate(null);
|
||||||
|
v.setWriting(writing);
|
||||||
|
v.setReadCount(0);
|
||||||
|
v.setSize(0);
|
||||||
|
v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId));
|
||||||
|
|
||||||
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
|
@ -96,4 +104,37 @@ public class MDStoreVersion implements Serializable {
|
||||||
public void setSize(final long size) {
|
public void setSize(final long size) {
|
||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHdfsPath() {
|
||||||
|
return hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHdfsPath(final String hdfsPath) {
|
||||||
|
this.hdfsPath = hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String
|
||||||
|
.format(
|
||||||
|
"MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
|
||||||
|
mdstore, writing, readCount, lastUpdate, size, hdfsPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof MDStoreVersion)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final MDStoreVersion other = (MDStoreVersion) obj;
|
||||||
|
return Objects.equals(id, other.id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import javax.persistence.Column;
|
import javax.persistence.Column;
|
||||||
import javax.persistence.Entity;
|
import javax.persistence.Entity;
|
||||||
|
@ -43,6 +44,10 @@ public class MDStoreWithInfo implements Serializable {
|
||||||
@Column(name = "current_version")
|
@Column(name = "current_version")
|
||||||
private String currentVersion;
|
private String currentVersion;
|
||||||
|
|
||||||
|
@Column(name = "creation_date")
|
||||||
|
@Temporal(TemporalType.TIMESTAMP)
|
||||||
|
private Date creationDate;
|
||||||
|
|
||||||
@Column(name = "lastupdate")
|
@Column(name = "lastupdate")
|
||||||
@Temporal(TemporalType.TIMESTAMP)
|
@Temporal(TemporalType.TIMESTAMP)
|
||||||
private Date lastUpdate;
|
private Date lastUpdate;
|
||||||
|
@ -53,6 +58,9 @@ public class MDStoreWithInfo implements Serializable {
|
||||||
@Column(name = "n_versions")
|
@Column(name = "n_versions")
|
||||||
private long numberOfVersions = 0;
|
private long numberOfVersions = 0;
|
||||||
|
|
||||||
|
@Column(name = "hdfs_path")
|
||||||
|
private String hdfsPath;
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
@ -117,6 +125,14 @@ public class MDStoreWithInfo implements Serializable {
|
||||||
this.currentVersion = currentVersion;
|
this.currentVersion = currentVersion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Date getCreationDate() {
|
||||||
|
return creationDate;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCreationDate(final Date creationDate) {
|
||||||
|
this.creationDate = creationDate;
|
||||||
|
}
|
||||||
|
|
||||||
public Date getLastUpdate() {
|
public Date getLastUpdate() {
|
||||||
return lastUpdate;
|
return lastUpdate;
|
||||||
}
|
}
|
||||||
|
@ -140,4 +156,39 @@ public class MDStoreWithInfo implements Serializable {
|
||||||
public void setNumberOfVersions(final long numberOfVersions) {
|
public void setNumberOfVersions(final long numberOfVersions) {
|
||||||
this.numberOfVersions = numberOfVersions;
|
this.numberOfVersions = numberOfVersions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHdfsPath() {
|
||||||
|
return hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHdfsPath(final String hdfsPath) {
|
||||||
|
this.hdfsPath = hdfsPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String
|
||||||
|
.format(
|
||||||
|
"MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
|
||||||
|
id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
|
||||||
|
lastUpdate, size, numberOfVersions, hdfsPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof MDStoreWithInfo)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final MDStoreWithInfo other = (MDStoreWithInfo) obj;
|
||||||
|
return Objects.equals(id, other.id);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.application;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
|
public class ApplicationUtils {
|
||||||
|
|
||||||
|
}
|
|
@ -1,10 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.application;
|
package eu.dnetlib.dhp.application;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.*;
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
import java.util.zip.GZIPOutputStream;
|
import java.util.zip.GZIPOutputStream;
|
||||||
|
@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
|
||||||
import org.apache.commons.cli.*;
|
import org.apache.commons.cli.*;
|
||||||
import org.apache.commons.codec.binary.Base64;
|
import org.apache.commons.codec.binary.Base64;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
public class ArgumentApplicationParser implements Serializable {
|
public class ArgumentApplicationParser implements Serializable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
|
||||||
|
|
||||||
private final Options options = new Options();
|
private final Options options = new Options();
|
||||||
private final Map<String, String> objectMap = new HashMap<>();
|
private final Map<String, String> objectMap = new HashMap<>();
|
||||||
|
|
||||||
private final List<String> compressedValues = new ArrayList<>();
|
private final List<String> compressedValues = new ArrayList<>();
|
||||||
|
|
||||||
public ArgumentApplicationParser(final String json_configuration) throws Exception {
|
public ArgumentApplicationParser(final String json_configuration) throws IOException {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
|
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
|
||||||
createOptionMap(configuration);
|
createOptionMap(configuration);
|
||||||
|
@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void createOptionMap(final OptionsParameter[] configuration) {
|
private void createOptionMap(final OptionsParameter[] configuration) {
|
||||||
|
|
||||||
Arrays
|
Arrays
|
||||||
.stream(configuration)
|
.stream(configuration)
|
||||||
.map(
|
.map(
|
||||||
|
@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable {
|
||||||
return o;
|
return o;
|
||||||
})
|
})
|
||||||
.forEach(options::addOption);
|
.forEach(options::addOption);
|
||||||
|
|
||||||
// HelpFormatter formatter = new HelpFormatter();
|
|
||||||
// formatter.printHelp("myapp", null, options, null, true);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String decompressValue(final String abstractCompressed) {
|
public static String decompressValue(final String abstractCompressed) {
|
||||||
|
@ -61,7 +57,7 @@ public class ArgumentApplicationParser implements Serializable {
|
||||||
IOUtils.copy(gis, stringWriter);
|
IOUtils.copy(gis, stringWriter);
|
||||||
return stringWriter.toString();
|
return stringWriter.toString();
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
System.out.println("Wrong value to decompress:" + abstractCompressed);
|
log.error("Wrong value to decompress:" + abstractCompressed);
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable {
|
||||||
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
|
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void parseArgument(final String[] args) throws Exception {
|
public void parseArgument(final String[] args) throws ParseException {
|
||||||
CommandLineParser parser = new BasicParser();
|
CommandLineParser parser = new BasicParser();
|
||||||
CommandLine cmd = parser.parse(options, args);
|
CommandLine cmd = parser.parse(options, args);
|
||||||
Arrays
|
Arrays
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.collector.worker.model;
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
|
@ -27,4 +27,24 @@ public class Constants {
|
||||||
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
|
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
|
||||||
|
public static final String REPORT_FILE_NAME = "/report";
|
||||||
|
public static final String MDSTORE_DATA_PATH = "/store";
|
||||||
|
public static final String MDSTORE_SIZE_PATH = "/size";
|
||||||
|
|
||||||
|
public static final String COLLECTION_MODE = "collectionMode";
|
||||||
|
public static final String METADATA_ENCODING = "metadataEncoding";
|
||||||
|
public static final String OOZIE_WF_PATH = "oozieWfPath";
|
||||||
|
public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
|
||||||
|
|
||||||
|
public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
|
||||||
|
public static final String REQUEST_DELAY = "requestDelay";
|
||||||
|
public static final String RETRY_DELAY = "retryDelay";
|
||||||
|
public static final String CONNECT_TIMEOUT = "connectTimeOut";
|
||||||
|
public static final String READ_TIMEOUT = "readTimeOut";
|
||||||
|
|
||||||
|
public static final String CONTENT_TOTALITEMS = "TotalItems";
|
||||||
|
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
|
||||||
|
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,39 +1,60 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.bson.Document;
|
import org.bson.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.mongodb.BasicDBObject;
|
||||||
import com.mongodb.MongoClient;
|
import com.mongodb.MongoClient;
|
||||||
import com.mongodb.MongoClientURI;
|
import com.mongodb.MongoClientURI;
|
||||||
|
import com.mongodb.QueryBuilder;
|
||||||
import com.mongodb.client.MongoCollection;
|
import com.mongodb.client.MongoCollection;
|
||||||
import com.mongodb.client.MongoDatabase;
|
import com.mongodb.client.MongoDatabase;
|
||||||
|
|
||||||
public class MdstoreClient implements Closeable {
|
public class MdstoreClient implements Closeable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
|
||||||
|
|
||||||
private final MongoClient client;
|
private final MongoClient client;
|
||||||
private final MongoDatabase db;
|
private final MongoDatabase db;
|
||||||
|
|
||||||
private static final String COLL_METADATA = "metadata";
|
private static final String COLL_METADATA = "metadata";
|
||||||
private static final String COLL_METADATA_MANAGER = "metadataManager";
|
private static final String COLL_METADATA_MANAGER = "metadataManager";
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(MdstoreClient.class);
|
|
||||||
|
|
||||||
public MdstoreClient(final String baseUrl, final String dbName) {
|
public MdstoreClient(final String baseUrl, final String dbName) {
|
||||||
this.client = new MongoClient(new MongoClientURI(baseUrl));
|
this.client = new MongoClient(new MongoClientURI(baseUrl));
|
||||||
this.db = getDb(client, dbName);
|
this.db = getDb(client, dbName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public MongoCollection<Document> mdStore(final String mdId) {
|
||||||
|
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
|
||||||
|
|
||||||
|
log.info("querying current mdId: {}", query.toJson());
|
||||||
|
|
||||||
|
final String currentId = Optional
|
||||||
|
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
|
||||||
|
.map(r -> r.first())
|
||||||
|
.map(d -> d.getString("currentId"))
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
|
||||||
|
|
||||||
|
log.info("currentId: {}", currentId);
|
||||||
|
|
||||||
|
return getColl(db, currentId, true);
|
||||||
|
}
|
||||||
|
|
||||||
public Map<String, String> validCollections(
|
public Map<String, String> validCollections(
|
||||||
final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.rest;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpGet;
|
||||||
|
import org.apache.http.client.methods.HttpPost;
|
||||||
|
import org.apache.http.client.methods.HttpUriRequest;
|
||||||
|
import org.apache.http.entity.StringEntity;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
public class DNetRestClient {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
|
||||||
|
|
||||||
|
private static ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
|
||||||
|
final HttpGet httpGet = new HttpGet(url);
|
||||||
|
return doHTTPRequest(httpGet, clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String doGET(final String url) throws Exception {
|
||||||
|
final HttpGet httpGet = new HttpGet(url);
|
||||||
|
return doHTTPRequest(httpGet);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <V> String doPOST(final String url, V objParam) throws Exception {
|
||||||
|
final HttpPost httpPost = new HttpPost(url);
|
||||||
|
|
||||||
|
if (objParam != null) {
|
||||||
|
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
|
||||||
|
httpPost.setEntity(entity);
|
||||||
|
httpPost.setHeader("Accept", "application/json");
|
||||||
|
httpPost.setHeader("Content-type", "application/json");
|
||||||
|
}
|
||||||
|
return doHTTPRequest(httpPost);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
|
||||||
|
return mapper.readValue(doPOST(url, objParam), clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
|
||||||
|
CloseableHttpClient client = HttpClients.createDefault();
|
||||||
|
|
||||||
|
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
|
||||||
|
log
|
||||||
|
.info(
|
||||||
|
"request headers: {}",
|
||||||
|
Arrays
|
||||||
|
.asList(r.getAllHeaders())
|
||||||
|
.stream()
|
||||||
|
.map(h -> h.getName() + ":" + h.getValue())
|
||||||
|
.collect(Collectors.joining(",")));
|
||||||
|
|
||||||
|
CloseableHttpResponse response = client.execute(r);
|
||||||
|
return IOUtils.toString(response.getEntity().getContent());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
|
||||||
|
return mapper.readValue(doHTTPRequest(r), clazz);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common.vocabulary;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common.vocabulary;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable {
|
||||||
|
|
||||||
private final Map<String, Vocabulary> vocs = new HashMap<>();
|
private final Map<String, Vocabulary> vocs = new HashMap<>();
|
||||||
|
|
||||||
|
public Set<String> vocabularyNames() {
|
||||||
|
return vocs.keySet();
|
||||||
|
}
|
||||||
|
|
||||||
public void addVocabulary(final String id, final String name) {
|
public void addVocabulary(final String id, final String name) {
|
||||||
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
||||||
}
|
}
|
||||||
|
@ -118,7 +122,31 @@ public class VocabularyGroup implements Serializable {
|
||||||
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
|
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* getSynonymAsQualifierCaseSensitive
|
||||||
|
*
|
||||||
|
* refelects the situation to check caseSensitive vocabulary
|
||||||
|
*/
|
||||||
|
public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
|
||||||
|
if (StringUtils.isBlank(vocId)) {
|
||||||
|
return OafMapperUtils.unknown("", "");
|
||||||
|
}
|
||||||
|
return vocs.get(vocId).getSynonymAsQualifier(syn);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* termExists
|
||||||
|
*
|
||||||
|
* two methods: without and with caseSensitive check
|
||||||
|
*/
|
||||||
public boolean termExists(final String vocId, final String id) {
|
public boolean termExists(final String vocId, final String id) {
|
||||||
|
return termExists(vocId, id, Boolean.FALSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
|
||||||
|
if (Boolean.TRUE.equals(caseSensitive)) {
|
||||||
|
return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
|
||||||
|
}
|
||||||
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
|
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common.vocabulary;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.message;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class Message implements Serializable {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 401753881204524893L;
|
||||||
|
|
||||||
|
public static String CURRENT_PARAM = "current";
|
||||||
|
public static String TOTAL_PARAM = "total";
|
||||||
|
|
||||||
|
private MessageType messageType;
|
||||||
|
|
||||||
|
private String workflowId;
|
||||||
|
|
||||||
|
private Map<String, String> body;
|
||||||
|
|
||||||
|
public Message() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public Message(final MessageType messageType, final String workflowId) {
|
||||||
|
this(messageType, workflowId, new LinkedHashMap<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
|
||||||
|
this.messageType = messageType;
|
||||||
|
this.workflowId = workflowId;
|
||||||
|
this.body = body;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MessageType getMessageType() {
|
||||||
|
return messageType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMessageType(MessageType messageType) {
|
||||||
|
this.messageType = messageType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getWorkflowId() {
|
||||||
|
return workflowId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWorkflowId(final String workflowId) {
|
||||||
|
this.workflowId = workflowId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> getBody() {
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBody(final Map<String, String> body) {
|
||||||
|
this.body = body;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,94 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.message;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
import org.apache.http.client.config.RequestConfig;
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpPut;
|
||||||
|
import org.apache.http.entity.ContentType;
|
||||||
|
import org.apache.http.entity.StringEntity;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
public class MessageSender {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
|
||||||
|
|
||||||
|
private static final int SOCKET_TIMEOUT_MS = 2000;
|
||||||
|
|
||||||
|
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
|
||||||
|
|
||||||
|
private static final int CONNTECTION_TIMEOUT_MS = 2000;
|
||||||
|
|
||||||
|
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||||
|
|
||||||
|
private final String dnetMessageEndpoint;
|
||||||
|
|
||||||
|
private final String workflowId;
|
||||||
|
|
||||||
|
private ExecutorService executorService = Executors.newCachedThreadPool();
|
||||||
|
|
||||||
|
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
|
||||||
|
this.workflowId = workflowId;
|
||||||
|
this.dnetMessageEndpoint = dnetMessageEndpoint;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void sendMessage(final Message message) {
|
||||||
|
executorService.submit(() -> _sendMessage(message));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void sendMessage(final Long current, final Long total) {
|
||||||
|
sendMessage(createOngoingMessage(current, total));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void sendReport(final Map<String, String> report) {
|
||||||
|
sendMessage(new Message(MessageType.REPORT, workflowId, report));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Message createOngoingMessage(final Long current, final Long total) {
|
||||||
|
final Message m = new Message(MessageType.ONGOING, workflowId);
|
||||||
|
m.getBody().put(Message.CURRENT_PARAM, current.toString());
|
||||||
|
if (total != null) {
|
||||||
|
m.getBody().put(Message.TOTAL_PARAM, total.toString());
|
||||||
|
}
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void _sendMessage(final Message message) {
|
||||||
|
try {
|
||||||
|
final String json = objectMapper.writeValueAsString(message);
|
||||||
|
|
||||||
|
final HttpPut req = new HttpPut(dnetMessageEndpoint);
|
||||||
|
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
|
||||||
|
|
||||||
|
final RequestConfig requestConfig = RequestConfig
|
||||||
|
.custom()
|
||||||
|
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
|
||||||
|
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
|
||||||
|
.setSocketTimeout(SOCKET_TIMEOUT_MS)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try (final CloseableHttpClient client = HttpClients
|
||||||
|
.custom()
|
||||||
|
.setDefaultRequestConfig(requestConfig)
|
||||||
|
.build();
|
||||||
|
final CloseableHttpResponse response = client.execute(req)) {
|
||||||
|
log.debug("Sent Message to " + dnetMessageEndpoint);
|
||||||
|
log.debug("MESSAGE:" + message);
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
|
||||||
|
}
|
||||||
|
} catch (final JsonProcessingException e) {
|
||||||
|
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.message;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
public enum MessageType implements Serializable {
|
||||||
|
|
||||||
|
ONGOING, REPORT;
|
||||||
|
|
||||||
|
public MessageType from(String value) {
|
||||||
|
return Optional
|
||||||
|
.ofNullable(value)
|
||||||
|
.map(StringUtils::upperCase)
|
||||||
|
.map(MessageType::valueOf)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,18 +1,29 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.utils;
|
package eu.dnetlib.dhp.utils;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.*;
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.security.MessageDigest;
|
import java.security.MessageDigest;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Properties;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
import java.util.zip.GZIPOutputStream;
|
import java.util.zip.GZIPOutputStream;
|
||||||
|
|
||||||
import org.apache.commons.codec.binary.Base64;
|
import org.apache.commons.codec.binary.Base64;
|
||||||
import org.apache.commons.codec.binary.Base64OutputStream;
|
import org.apache.commons.codec.binary.Base64OutputStream;
|
||||||
import org.apache.commons.codec.binary.Hex;
|
import org.apache.commons.codec.binary.Hex;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
import com.jayway.jsonpath.JsonPath;
|
import com.jayway.jsonpath.JsonPath;
|
||||||
|
|
||||||
import net.minidev.json.JSONArray;
|
import net.minidev.json.JSONArray;
|
||||||
|
@ -21,6 +32,8 @@ import scala.collection.Seq;
|
||||||
|
|
||||||
public class DHPUtils {
|
public class DHPUtils {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
|
||||||
|
|
||||||
public static Seq<String> toSeq(List<String> list) {
|
public static Seq<String> toSeq(List<String> list) {
|
||||||
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
|
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
|
||||||
}
|
}
|
||||||
|
@ -79,4 +92,72 @@ public class DHPUtils {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
public static void writeHdfsFile(final Configuration conf, final String content, final String path)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
log.info("writing file {}, size {}", path, content.length());
|
||||||
|
try (FileSystem fs = FileSystem.get(conf);
|
||||||
|
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
|
||||||
|
os.write(content.getBytes(StandardCharsets.UTF_8));
|
||||||
|
os.flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String readHdfsFile(Configuration conf, String path) throws IOException {
|
||||||
|
log.info("reading file {}", path);
|
||||||
|
|
||||||
|
try (FileSystem fs = FileSystem.get(conf)) {
|
||||||
|
final Path p = new Path(path);
|
||||||
|
if (!fs.exists(p)) {
|
||||||
|
throw new FileNotFoundException(path);
|
||||||
|
}
|
||||||
|
return IOUtils.toString(fs.open(p));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
|
||||||
|
return MAPPER.readValue(readHdfsFile(conf, path), clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
|
||||||
|
log.info("saving dataset in: {}", targetPath);
|
||||||
|
mdstore
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.format("parquet")
|
||||||
|
.save(targetPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Configuration getHadoopConfiguration(String nameNode) {
|
||||||
|
// ====== Init HDFS File System Object
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
// Set FileSystem URI
|
||||||
|
conf.set("fs.defaultFS", nameNode);
|
||||||
|
// Because of Maven
|
||||||
|
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||||
|
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||||
|
|
||||||
|
System.setProperty("hadoop.home.dir", "/");
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
|
||||||
|
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||||
|
Properties props = new Properties();
|
||||||
|
report.forEach((k, v) -> props.setProperty(k, v));
|
||||||
|
|
||||||
|
try (OutputStream os = new FileOutputStream(file)) {
|
||||||
|
props.store(os, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
|
||||||
|
Map<String, String> report = Maps.newHashMap();
|
||||||
|
report.put(paramName, value);
|
||||||
|
|
||||||
|
populateOOZIEEnv(report);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,76 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.message;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
public class Message {
|
|
||||||
|
|
||||||
private String workflowId;
|
|
||||||
|
|
||||||
private String jobName;
|
|
||||||
|
|
||||||
private MessageType type;
|
|
||||||
|
|
||||||
private Map<String, String> body;
|
|
||||||
|
|
||||||
public static Message fromJson(final String json) throws IOException {
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
|
||||||
return jsonMapper.readValue(json, Message.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Message() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
|
|
||||||
this.workflowId = workflowId;
|
|
||||||
this.jobName = jobName;
|
|
||||||
this.type = type;
|
|
||||||
this.body = body;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getWorkflowId() {
|
|
||||||
return workflowId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setWorkflowId(String workflowId) {
|
|
||||||
this.workflowId = workflowId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getJobName() {
|
|
||||||
return jobName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setJobName(String jobName) {
|
|
||||||
this.jobName = jobName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public MessageType getType() {
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setType(MessageType type) {
|
|
||||||
this.type = type;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, String> getBody() {
|
|
||||||
return body;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setBody(Map<String, String> body) {
|
|
||||||
this.body = body;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
|
||||||
try {
|
|
||||||
return jsonMapper.writeValueAsString(this);
|
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,47 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.message;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
|
||||||
|
|
||||||
import com.rabbitmq.client.AMQP;
|
|
||||||
import com.rabbitmq.client.Channel;
|
|
||||||
import com.rabbitmq.client.DefaultConsumer;
|
|
||||||
import com.rabbitmq.client.Envelope;
|
|
||||||
|
|
||||||
public class MessageConsumer extends DefaultConsumer {
|
|
||||||
|
|
||||||
final LinkedBlockingQueue<Message> queueMessages;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructs a new instance and records its association to the passed-in channel.
|
|
||||||
*
|
|
||||||
* @param channel the channel to which this consumer is attached
|
|
||||||
* @param queueMessages
|
|
||||||
*/
|
|
||||||
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
|
|
||||||
super(channel);
|
|
||||||
this.queueMessages = queueMessages;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void handleDelivery(
|
|
||||||
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
|
|
||||||
throws IOException {
|
|
||||||
final String json = new String(body, StandardCharsets.UTF_8);
|
|
||||||
Message message = Message.fromJson(json);
|
|
||||||
try {
|
|
||||||
this.queueMessages.put(message);
|
|
||||||
System.out.println("Receiving Message " + message);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
if (message.getType() == MessageType.REPORT)
|
|
||||||
throw new RuntimeException("Error on sending message");
|
|
||||||
else {
|
|
||||||
// TODO LOGGING EXCEPTION
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
getChannel().basicAck(envelope.getDeliveryTag(), false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,136 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.message;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
|
||||||
import java.util.concurrent.TimeoutException;
|
|
||||||
|
|
||||||
import com.rabbitmq.client.Channel;
|
|
||||||
import com.rabbitmq.client.Connection;
|
|
||||||
import com.rabbitmq.client.ConnectionFactory;
|
|
||||||
|
|
||||||
public class MessageManager {
|
|
||||||
|
|
||||||
private final String messageHost;
|
|
||||||
|
|
||||||
private final String username;
|
|
||||||
|
|
||||||
private final String password;
|
|
||||||
|
|
||||||
private Connection connection;
|
|
||||||
|
|
||||||
private final Map<String, Channel> channels = new HashMap<>();
|
|
||||||
|
|
||||||
private boolean durable;
|
|
||||||
|
|
||||||
private boolean autodelete;
|
|
||||||
|
|
||||||
private final LinkedBlockingQueue<Message> queueMessages;
|
|
||||||
|
|
||||||
public MessageManager(
|
|
||||||
String messageHost,
|
|
||||||
String username,
|
|
||||||
String password,
|
|
||||||
final LinkedBlockingQueue<Message> queueMessages) {
|
|
||||||
this.queueMessages = queueMessages;
|
|
||||||
this.messageHost = messageHost;
|
|
||||||
this.username = username;
|
|
||||||
this.password = password;
|
|
||||||
}
|
|
||||||
|
|
||||||
public MessageManager(
|
|
||||||
String messageHost,
|
|
||||||
String username,
|
|
||||||
String password,
|
|
||||||
boolean durable,
|
|
||||||
boolean autodelete,
|
|
||||||
final LinkedBlockingQueue<Message> queueMessages) {
|
|
||||||
this.queueMessages = queueMessages;
|
|
||||||
this.messageHost = messageHost;
|
|
||||||
this.username = username;
|
|
||||||
this.password = password;
|
|
||||||
|
|
||||||
this.durable = durable;
|
|
||||||
this.autodelete = autodelete;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Connection createConnection() throws IOException, TimeoutException {
|
|
||||||
ConnectionFactory factory = new ConnectionFactory();
|
|
||||||
factory.setHost(this.messageHost);
|
|
||||||
factory.setUsername(this.username);
|
|
||||||
factory.setPassword(this.password);
|
|
||||||
return factory.newConnection();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Channel createChannel(
|
|
||||||
final Connection connection,
|
|
||||||
final String queueName,
|
|
||||||
final boolean durable,
|
|
||||||
final boolean autodelete)
|
|
||||||
throws Exception {
|
|
||||||
Map<String, Object> args = new HashMap<>();
|
|
||||||
args.put("x-message-ttl", 10000);
|
|
||||||
Channel channel = connection.createChannel();
|
|
||||||
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
|
|
||||||
return channel;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
|
|
||||||
throws Exception {
|
|
||||||
if (channels.containsKey(queueName)) {
|
|
||||||
return channels.get(queueName);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.connection == null) {
|
|
||||||
this.connection = createConnection();
|
|
||||||
}
|
|
||||||
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
|
|
||||||
return channels.get(queueName);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws IOException {
|
|
||||||
channels
|
|
||||||
.values()
|
|
||||||
.forEach(
|
|
||||||
ch -> {
|
|
||||||
try {
|
|
||||||
ch.close();
|
|
||||||
} catch (Exception e) {
|
|
||||||
// TODO LOG
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
this.connection.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean sendMessage(final Message message, String queueName) throws Exception {
|
|
||||||
try {
|
|
||||||
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
|
|
||||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
|
||||||
return true;
|
|
||||||
} catch (Throwable e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean sendMessage(
|
|
||||||
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
|
|
||||||
throws Exception {
|
|
||||||
try {
|
|
||||||
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
|
|
||||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
|
||||||
return true;
|
|
||||||
} catch (Throwable e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void startConsumingMessage(
|
|
||||||
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
|
|
||||||
|
|
||||||
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
|
|
||||||
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.message;
|
|
||||||
|
|
||||||
public enum MessageType {
|
|
||||||
ONGOING, REPORT
|
|
||||||
}
|
|
|
@ -1,16 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.model.mdstore;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
public class MetadataRecordTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void getTimestamp() {
|
|
||||||
|
|
||||||
MetadataRecord r = new MetadataRecord();
|
|
||||||
assertTrue(r.getDateOfCollection() > 0);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,51 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.message;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
public class MessageTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void fromJsonTest() throws IOException {
|
|
||||||
Message m = new Message();
|
|
||||||
m.setWorkflowId("wId");
|
|
||||||
m.setType(MessageType.ONGOING);
|
|
||||||
m.setJobName("Collection");
|
|
||||||
Map<String, String> body = new HashMap<>();
|
|
||||||
body.put("parsedItem", "300");
|
|
||||||
body.put("ExecutionTime", "30s");
|
|
||||||
|
|
||||||
m.setBody(body);
|
|
||||||
System.out.println("m = " + m);
|
|
||||||
Message m1 = Message.fromJson(m.toString());
|
|
||||||
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
|
|
||||||
assertEquals(m1.getType(), m.getType());
|
|
||||||
assertEquals(m1.getJobName(), m.getJobName());
|
|
||||||
|
|
||||||
assertNotNull(m1.getBody());
|
|
||||||
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
|
|
||||||
assertEquals(m1.getJobName(), m.getJobName());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void toStringTest() {
|
|
||||||
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
|
|
||||||
Message m = new Message();
|
|
||||||
m.setWorkflowId("wId");
|
|
||||||
m.setType(MessageType.ONGOING);
|
|
||||||
m.setJobName("Collection");
|
|
||||||
Map<String, String> body = new HashMap<>();
|
|
||||||
body.put("parsedItem", "300");
|
|
||||||
body.put("ExecutionTime", "30s");
|
|
||||||
|
|
||||||
m.setBody(body);
|
|
||||||
|
|
||||||
assertEquals(expectedJson, m.toString());
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -67,6 +67,11 @@
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-codec</groupId>
|
||||||
|
<artifactId>commons-codec</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,17 +3,19 @@ package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
import static com.google.common.base.Preconditions.checkArgument;
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.security.MessageDigest;
|
||||||
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.time.temporal.TemporalAccessor;
|
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.binary.Hex;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
@ -252,13 +254,6 @@ public class ModelSupport {
|
||||||
.setRelation("isRelatedTo")
|
.setRelation("isRelatedTo")
|
||||||
.setRelType("resultResult")
|
.setRelType("resultResult")
|
||||||
.setSubReltype("relationship"));
|
.setSubReltype("relationship"));
|
||||||
relationInverseMap
|
|
||||||
.put(
|
|
||||||
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
|
||||||
.setInverse("hasAmongTopNSimilarDocuments")
|
|
||||||
.setRelation("isAmongTopNSimilarDocuments")
|
|
||||||
.setRelType("resultResult")
|
|
||||||
.setSubReltype("similarity"));
|
|
||||||
relationInverseMap
|
relationInverseMap
|
||||||
.put(
|
.put(
|
||||||
"resultResult_supplement_isSupplementTo", new RelationInverse()
|
"resultResult_supplement_isSupplementTo", new RelationInverse()
|
||||||
|
@ -482,6 +477,20 @@ public class ModelSupport {
|
||||||
return ((OafEntity) t).getId();
|
return ((OafEntity) t).getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String md5(final String s) {
|
||||||
|
try {
|
||||||
|
final MessageDigest md = MessageDigest.getInstance("MD5");
|
||||||
|
md.update(s.getBytes(StandardCharsets.UTF_8));
|
||||||
|
return new String(Hex.encodeHex(md.digest()));
|
||||||
|
} catch (final NoSuchAlgorithmException e) {
|
||||||
|
throw new IllegalStateException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String generateIdentifier(final String originalId, final String nsPrefix) {
|
||||||
|
return String.format("%s::%s", nsPrefix, md5(originalId));
|
||||||
|
}
|
||||||
|
|
||||||
public static String oldest(String dateA, String dateB) throws ParseException {
|
public static String oldest(String dateA, String dateB) throws ParseException {
|
||||||
|
|
||||||
if (StringUtils.isBlank(dateA)) {
|
if (StringUtils.isBlank(dateA)) {
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.model.mdstore;
|
package eu.dnetlib.dhp.schema.mdstore;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
|
||||||
/** This class models a record inside the new Metadata store collection on HDFS * */
|
/**
|
||||||
|
* This class models a record in a Metadata store collection on HDFS
|
||||||
|
*/
|
||||||
public class MetadataRecord implements Serializable {
|
public class MetadataRecord implements Serializable {
|
||||||
|
|
||||||
/** The D-Net Identifier associated to the record */
|
/** The D-Net Identifier associated to the record */
|
||||||
|
@ -26,13 +28,13 @@ public class MetadataRecord implements Serializable {
|
||||||
private String body;
|
private String body;
|
||||||
|
|
||||||
/** the date when the record has been stored */
|
/** the date when the record has been stored */
|
||||||
private long dateOfCollection;
|
private Long dateOfCollection;
|
||||||
|
|
||||||
/** the date when the record has been stored */
|
/** the date when the record has been stored */
|
||||||
private long dateOfTransformation;
|
private Long dateOfTransformation;
|
||||||
|
|
||||||
public MetadataRecord() {
|
public MetadataRecord() {
|
||||||
this.dateOfCollection = System.currentTimeMillis();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public MetadataRecord(
|
public MetadataRecord(
|
||||||
|
@ -40,14 +42,14 @@ public class MetadataRecord implements Serializable {
|
||||||
String encoding,
|
String encoding,
|
||||||
Provenance provenance,
|
Provenance provenance,
|
||||||
String body,
|
String body,
|
||||||
long dateOfCollection) {
|
Long dateOfCollection) {
|
||||||
|
|
||||||
this.originalId = originalId;
|
this.originalId = originalId;
|
||||||
this.encoding = encoding;
|
this.encoding = encoding;
|
||||||
this.provenance = provenance;
|
this.provenance = provenance;
|
||||||
this.body = body;
|
this.body = body;
|
||||||
this.dateOfCollection = dateOfCollection;
|
this.dateOfCollection = dateOfCollection;
|
||||||
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
|
this.id = ModelSupport.generateIdentifier(originalId, this.provenance.getNsPrefix());
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
|
@ -90,19 +92,19 @@ public class MetadataRecord implements Serializable {
|
||||||
this.body = body;
|
this.body = body;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long getDateOfCollection() {
|
public Long getDateOfCollection() {
|
||||||
return dateOfCollection;
|
return dateOfCollection;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDateOfCollection(long dateOfCollection) {
|
public void setDateOfCollection(Long dateOfCollection) {
|
||||||
this.dateOfCollection = dateOfCollection;
|
this.dateOfCollection = dateOfCollection;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long getDateOfTransformation() {
|
public Long getDateOfTransformation() {
|
||||||
return dateOfTransformation;
|
return dateOfTransformation;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDateOfTransformation(long dateOfTransformation) {
|
public void setDateOfTransformation(Long dateOfTransformation) {
|
||||||
this.dateOfTransformation = dateOfTransformation;
|
this.dateOfTransformation = dateOfTransformation;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.model.mdstore;
|
package eu.dnetlib.dhp.schema.mdstore;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -15,11 +15,11 @@ object OafUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
|
def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = {
|
||||||
val di = new DataInfo
|
val di = new DataInfo
|
||||||
di.setDeletedbyinference(false)
|
di.setDeletedbyinference(false)
|
||||||
di.setInferred(false)
|
di.setInferred(false)
|
||||||
di.setInvisible(false)
|
di.setInvisible(invisible)
|
||||||
di.setTrust(trust)
|
di.setTrust(trust)
|
||||||
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
||||||
di
|
di
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -94,7 +95,7 @@ public class MergeTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void mergeRelationTestParseException() {
|
public void mergeRelationTestParseException() {
|
||||||
assertThrows(IllegalArgumentException.class, () -> {
|
assertThrows(DateTimeParseException.class, () -> {
|
||||||
Relation a = createRel(true, "2016-04-05");
|
Relation a = createRel(true, "2016-04-05");
|
||||||
Relation b = createRel(true, "2016-04-05");
|
Relation b = createRel(true, "2016-04-05");
|
||||||
a.mergeFrom(b);
|
a.mergeFrom(b);
|
||||||
|
|
|
@ -1,29 +1,27 @@
|
||||||
Description of the Module
|
Description of the Module
|
||||||
--------------------------
|
--------------------------
|
||||||
This module defines a **collector worker application** that runs on Hadoop.
|
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
|
||||||
|
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||||
|
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
||||||
|
of each MDStore.
|
||||||
|
|
||||||
It is responsible for harvesting metadata using different plugins.
|
## Metadata collection
|
||||||
|
|
||||||
The collector worker uses a message queue to inform the progress
|
The **metadata collection workflow** is responsible for harvesting metadata records from different protocols and responding to
|
||||||
of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore,
|
different formats and to store them as on HDFS so that they can be further processed.
|
||||||
It gives, at the end of the job, some information about the status
|
|
||||||
of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
|
|
||||||
|
|
||||||
To work the collection worker need some parameter like:
|
### Collector Plugins
|
||||||
|
|
||||||
* **hdfsPath**: the path where storing the sequential file
|
Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:
|
||||||
* **apidescriptor**: the JSON encoding of the API Descriptor
|
|
||||||
* **namenode**: the Name Node URI
|
|
||||||
* **userHDFS**: the user wich create the hdfs seq file
|
|
||||||
* **rabbitUser**: the user to connect with RabbitMq for messaging
|
|
||||||
* **rabbitPassWord**: the password to connect with RabbitMq for messaging
|
|
||||||
* **rabbitHost**: the host of the RabbitMq server
|
|
||||||
* **rabbitOngoingQueue**: the name of the ongoing queue
|
|
||||||
* **rabbitReportQueue**: the name of the report queue
|
|
||||||
* **workflowId**: the identifier of the dnet Workflow
|
|
||||||
|
|
||||||
##Plugins
|
```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```
|
||||||
* OAI Plugin
|
|
||||||
|
The list of the supported plugins:
|
||||||
|
|
||||||
|
* OAI Plugin: collects from OAI-PMH compatible endpoints
|
||||||
|
* MDStore plugin: collects from a given D-Net MetadataStore, (identified by moogodb URI, dbName, MDStoreID)
|
||||||
|
* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
|
||||||
|
|
||||||
|
# Transformation Plugins
|
||||||
|
TODO
|
||||||
|
|
||||||
## Usage
|
|
||||||
TODO
|
|
|
@ -7,10 +7,44 @@
|
||||||
<version>1.2.4-SNAPSHOT</version>
|
<version>1.2.4-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>dhp-aggregation</artifactId>
|
<artifactId>dhp-aggregation</artifactId>
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>net.alchim31.maven</groupId>
|
||||||
|
<artifactId>scala-maven-plugin</artifactId>
|
||||||
|
<version>${net.alchim31.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>scala-compile-first</id>
|
||||||
|
<phase>initialize</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
<goal>compile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>scala-test-compile</id>
|
||||||
|
<phase>process-test-resources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>testCompile</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
<configuration>
|
||||||
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
|
||||||
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpclient</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
@ -24,15 +58,9 @@
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-common</artifactId>
|
<artifactId>dhp-common</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
<exclusions>
|
|
||||||
<exclusion>
|
|
||||||
<groupId>com.sun.xml.bind</groupId>
|
|
||||||
<artifactId>jaxb-core</artifactId>
|
|
||||||
</exclusion>
|
|
||||||
</exclusions>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-schemas</artifactId>
|
<artifactId>dhp-schemas</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
|
@ -58,6 +86,11 @@
|
||||||
<artifactId>jaxen</artifactId>
|
<artifactId>jaxen</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.json</groupId>
|
||||||
|
<artifactId>json</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
|
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
|
@ -78,8 +111,11 @@
|
||||||
<artifactId>commons-compress</artifactId>
|
<artifactId>commons-compress</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mongodb</groupId>
|
||||||
|
<artifactId>mongo-java-driver</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -0,0 +1,83 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
||||||
|
import org.apache.http.entity.StringEntity
|
||||||
|
import org.apache.http.impl.client.HttpClients
|
||||||
|
|
||||||
|
import java.io.IOException
|
||||||
|
|
||||||
|
abstract class AbstractRestClient extends Iterator[String]{
|
||||||
|
|
||||||
|
var buffer: List[String] = List()
|
||||||
|
var current_index:Int = 0
|
||||||
|
|
||||||
|
var scroll_value: Option[String] = None
|
||||||
|
|
||||||
|
var complete:Boolean = false
|
||||||
|
|
||||||
|
|
||||||
|
def extractInfo(input: String): Unit
|
||||||
|
|
||||||
|
protected def getBufferData(): Unit
|
||||||
|
|
||||||
|
|
||||||
|
def doHTTPGETRequest(url:String): String = {
|
||||||
|
val httpGet = new HttpGet(url)
|
||||||
|
doHTTPRequest(httpGet)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def doHTTPPOSTRequest(url:String, json:String): String = {
|
||||||
|
val httpPost = new HttpPost(url)
|
||||||
|
if (json != null) {
|
||||||
|
val entity = new StringEntity(json)
|
||||||
|
httpPost.setEntity(entity)
|
||||||
|
httpPost.setHeader("Accept", "application/json")
|
||||||
|
httpPost.setHeader("Content-type", "application/json")
|
||||||
|
}
|
||||||
|
doHTTPRequest(httpPost)
|
||||||
|
}
|
||||||
|
|
||||||
|
def hasNext: Boolean = {
|
||||||
|
buffer.nonEmpty && current_index < buffer.size
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
override def next(): String = {
|
||||||
|
val next_item:String = buffer(current_index)
|
||||||
|
current_index = current_index + 1
|
||||||
|
if (current_index == buffer.size)
|
||||||
|
getBufferData()
|
||||||
|
next_item
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
|
||||||
|
val client = HttpClients.createDefault
|
||||||
|
try {
|
||||||
|
var tries = 4
|
||||||
|
while (tries > 0) {
|
||||||
|
val response = client.execute(r)
|
||||||
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
|
tries -= 1
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
|
}
|
||||||
|
""
|
||||||
|
} catch {
|
||||||
|
case e: Throwable =>
|
||||||
|
throw new RuntimeException("Error on executing request ", e)
|
||||||
|
} finally try client.close()
|
||||||
|
catch {
|
||||||
|
case e: IOException =>
|
||||||
|
throw new RuntimeException("Unable to close client ", e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
getBufferData()
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import org.json4s.{DefaultFormats, JValue}
|
||||||
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
|
|
||||||
|
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10) extends AbstractRestClient {
|
||||||
|
|
||||||
|
override def extractInfo(input: String): Unit = {
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: org.json4s.JValue = parse(input)
|
||||||
|
buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s)))
|
||||||
|
val next_url = (json \ "links" \ "next").extractOrElse[String](null)
|
||||||
|
scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None
|
||||||
|
if (scroll_value.isEmpty)
|
||||||
|
complete = true
|
||||||
|
current_index = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
override def getBufferData(): Unit = {
|
||||||
|
if (!complete) {
|
||||||
|
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20*]")
|
||||||
|
extractInfo(response)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,483 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
|
import org.apache.commons.lang3.StringUtils
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
|
import java.nio.charset.CodingErrorAction
|
||||||
|
import java.text.SimpleDateFormat
|
||||||
|
import java.time.LocalDate
|
||||||
|
import java.time.format.DateTimeFormatter
|
||||||
|
import java.util.{Date, Locale}
|
||||||
|
import java.util.regex.Pattern
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.{Codec, Source}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
case class DataciteType(doi:String,timestamp:Long,isActive:Boolean, json:String ){}
|
||||||
|
|
||||||
|
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
||||||
|
|
||||||
|
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
||||||
|
|
||||||
|
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||||
|
|
||||||
|
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||||
|
|
||||||
|
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||||
|
|
||||||
|
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
||||||
|
|
||||||
|
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||||
|
|
||||||
|
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
|
||||||
|
|
||||||
|
object DataciteToOAFTransformation {
|
||||||
|
|
||||||
|
implicit val codec: Codec = Codec("UTF-8")
|
||||||
|
codec.onMalformedInput(CodingErrorAction.REPLACE)
|
||||||
|
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private val PID_VOCABULARY = "dnet:pid_types"
|
||||||
|
val COBJ_VOCABULARY = "dnet:publication_resource"
|
||||||
|
val RESULT_VOCABULARY = "dnet:result_typologies"
|
||||||
|
val ACCESS_MODE_VOCABULARY = "dnet:access_modes"
|
||||||
|
val DOI_CLASS = "doi"
|
||||||
|
|
||||||
|
val TITLE_SCHEME = "dnet:dataCite_title"
|
||||||
|
val SUBJ_CLASS = "keywords"
|
||||||
|
val SUBJ_SCHEME = "dnet:subject_classification_typologies"
|
||||||
|
|
||||||
|
val j_filter:List[String] = {
|
||||||
|
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
|
||||||
|
s.lines.toList
|
||||||
|
}
|
||||||
|
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F))
|
||||||
|
|
||||||
|
val dataInfo: DataInfo = generateDataInfo("0.9")
|
||||||
|
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite")
|
||||||
|
|
||||||
|
val hostedByMap: Map[String, HostedByMapType] = {
|
||||||
|
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: org.json4s.JValue = parse(s)
|
||||||
|
json.extract[Map[String, HostedByMapType]]
|
||||||
|
}
|
||||||
|
|
||||||
|
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
|
||||||
|
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||||
|
|
||||||
|
val funder_regex:List[(Pattern, String)] = List(
|
||||||
|
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda__h2020::"),
|
||||||
|
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda_______::")
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
val Date_regex: List[Pattern] = List(
|
||||||
|
//Y-M-D
|
||||||
|
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||||
|
//M-D-Y
|
||||||
|
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
||||||
|
//D-M-Y
|
||||||
|
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
||||||
|
//Y
|
||||||
|
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_json(json:String):Boolean = {
|
||||||
|
j_filter.exists(f => json.contains(f))
|
||||||
|
}
|
||||||
|
|
||||||
|
def toActionSet(item:Oaf) :(String, String) = {
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
item match {
|
||||||
|
case dataset: OafDataset =>
|
||||||
|
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||||
|
a.setClazz(classOf[OafDataset])
|
||||||
|
a.setPayload(dataset)
|
||||||
|
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case publication: Publication =>
|
||||||
|
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||||
|
a.setClazz(classOf[Publication])
|
||||||
|
a.setPayload(publication)
|
||||||
|
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case software: Software =>
|
||||||
|
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||||
|
a.setClazz(classOf[Software])
|
||||||
|
a.setPayload(software)
|
||||||
|
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case orp: OtherResearchProduct =>
|
||||||
|
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||||
|
a.setClazz(classOf[OtherResearchProduct])
|
||||||
|
a.setPayload(orp)
|
||||||
|
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
|
||||||
|
case relation: Relation =>
|
||||||
|
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||||
|
a.setClazz(classOf[Relation])
|
||||||
|
a.setPayload(relation)
|
||||||
|
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||||
|
case _ =>
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def embargo_end(embargo_end_date: String): Boolean = {
|
||||||
|
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||||
|
val td = LocalDate.now()
|
||||||
|
td.isAfter(dt)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_date(input: String): Option[String] = {
|
||||||
|
val d = Date_regex.map(pattern => {
|
||||||
|
val matcher = pattern.matcher(input)
|
||||||
|
if (matcher.find())
|
||||||
|
matcher.group(0)
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
).find(s => s != null)
|
||||||
|
|
||||||
|
if (d.isDefined) {
|
||||||
|
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||||
|
try {
|
||||||
|
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||||
|
} catch {
|
||||||
|
case _: Throwable => try {
|
||||||
|
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||||
|
} catch {
|
||||||
|
case _: Throwable => try {
|
||||||
|
return None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
d
|
||||||
|
}
|
||||||
|
|
||||||
|
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = {
|
||||||
|
if (resourceType != null && resourceType.nonEmpty) {
|
||||||
|
val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceType)
|
||||||
|
if (typeQualifier != null)
|
||||||
|
return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid))
|
||||||
|
}
|
||||||
|
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||||
|
val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, schemaOrg)
|
||||||
|
if (typeQualifier != null)
|
||||||
|
return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid))
|
||||||
|
|
||||||
|
}
|
||||||
|
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||||
|
val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceTypeGeneral)
|
||||||
|
if (typeQualifier != null)
|
||||||
|
return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid))
|
||||||
|
|
||||||
|
}
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): Result = {
|
||||||
|
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||||
|
if (typeQualifiers == null)
|
||||||
|
return null
|
||||||
|
val i = new Instance
|
||||||
|
i.setInstancetype(typeQualifiers._1)
|
||||||
|
typeQualifiers._2.getClassname match {
|
||||||
|
case "dataset" =>
|
||||||
|
val r = new OafDataset
|
||||||
|
r.setInstance(List(i).asJava)
|
||||||
|
return r
|
||||||
|
case "publication" =>
|
||||||
|
val r = new Publication
|
||||||
|
r.setInstance(List(i).asJava)
|
||||||
|
return r
|
||||||
|
case "software" =>
|
||||||
|
val r = new Software
|
||||||
|
r.setInstance(List(i).asJava)
|
||||||
|
return r
|
||||||
|
case "other" =>
|
||||||
|
val r = new OtherResearchProduct
|
||||||
|
r.setInstance(List(i).asJava)
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def available_date(input: String): Boolean = {
|
||||||
|
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: org.json4s.JValue = parse(input)
|
||||||
|
val l: List[String] = for {
|
||||||
|
JObject(dates) <- json \\ "dates"
|
||||||
|
JField("dateType", JString(dateTypes)) <- dates
|
||||||
|
} yield dateTypes
|
||||||
|
|
||||||
|
l.exists(p => p.equalsIgnoreCase("available"))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||||
|
OafMapperUtils.structuredProperty(dt, q, null)
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateRelation(sourceId:String, targetId:String, relClass:String, cf:KeyValue, di:DataInfo) :Relation = {
|
||||||
|
|
||||||
|
val r = new Relation
|
||||||
|
r.setSource(sourceId)
|
||||||
|
r.setTarget(targetId)
|
||||||
|
r.setRelType("resultProject")
|
||||||
|
r.setRelClass(relClass)
|
||||||
|
r.setSubRelType("outcome")
|
||||||
|
r.setCollectedfrom(List(cf).asJava)
|
||||||
|
r.setDataInfo(di)
|
||||||
|
r
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_projectRelation(awardUri:String, sourceId:String):List[Relation] = {
|
||||||
|
val match_pattern = funder_regex.find(s =>s._1.matcher(awardUri).find())
|
||||||
|
|
||||||
|
if (match_pattern.isDefined) {
|
||||||
|
val m =match_pattern.get._1
|
||||||
|
val p = match_pattern.get._2
|
||||||
|
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||||
|
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||||
|
List(
|
||||||
|
generateRelation(sourceId, targetId,"isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
|
||||||
|
generateRelation(targetId, sourceId,"produces", DATACITE_COLLECTED_FROM, dataInfo)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
List()
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def generateOAF(input:String,ts:Long, dateOfCollection:Long, vocabularies: VocabularyGroup):List[Oaf] = {
|
||||||
|
if (filter_json(input))
|
||||||
|
return List()
|
||||||
|
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json = parse(input)
|
||||||
|
|
||||||
|
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||||
|
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||||
|
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||||
|
|
||||||
|
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||||
|
if (doi.isEmpty)
|
||||||
|
return List()
|
||||||
|
|
||||||
|
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||||
|
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||||
|
if (result == null)
|
||||||
|
return List()
|
||||||
|
|
||||||
|
|
||||||
|
val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi")
|
||||||
|
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||||
|
result.setPid(List(pid).asJava)
|
||||||
|
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||||
|
result.setOriginalId(List(doi).asJava)
|
||||||
|
|
||||||
|
val d = new Date(dateOfCollection*1000)
|
||||||
|
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||||
|
|
||||||
|
|
||||||
|
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||||
|
result.setDateoftransformation(ISO8601FORMAT.format(ts))
|
||||||
|
result.setDataInfo(dataInfo)
|
||||||
|
|
||||||
|
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||||
|
|
||||||
|
|
||||||
|
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||||
|
val a = new Author
|
||||||
|
a.setFullname(c.name.orNull)
|
||||||
|
a.setName(c.givenName.orNull)
|
||||||
|
a.setSurname(c.familyName.orNull)
|
||||||
|
if (c.nameIdentifiers!= null&& c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||||
|
a.setPid(c.nameIdentifiers.get.map(ni => {
|
||||||
|
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
||||||
|
if (ni.nameIdentifier!= null && ni.nameIdentifier.isDefined) {
|
||||||
|
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
null
|
||||||
|
|
||||||
|
}
|
||||||
|
)
|
||||||
|
.asJava)
|
||||||
|
}
|
||||||
|
if (c.affiliation.isDefined)
|
||||||
|
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
||||||
|
a.setRank(idx + 1)
|
||||||
|
a
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
val titles:List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||||
|
|
||||||
|
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
||||||
|
if (t.titleType.isEmpty) {
|
||||||
|
OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null)
|
||||||
|
} else {
|
||||||
|
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null)
|
||||||
|
}
|
||||||
|
}).asJava)
|
||||||
|
|
||||||
|
if(authors==null || authors.isEmpty || !authors.exists(a => a !=null))
|
||||||
|
return List()
|
||||||
|
result.setAuthor(authors.asJava)
|
||||||
|
|
||||||
|
val dates = (json \\ "dates").extract[List[DateType]]
|
||||||
|
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||||
|
|
||||||
|
val i_date = dates
|
||||||
|
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||||
|
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||||
|
.map(d => extract_date(d.date.get))
|
||||||
|
val a_date: Option[String] = dates
|
||||||
|
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||||
|
.map(d => extract_date(d.date.get))
|
||||||
|
.find(d => d != null && d.isDefined)
|
||||||
|
.map(d => d.get)
|
||||||
|
|
||||||
|
if (a_date.isDefined) {
|
||||||
|
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||||
|
}
|
||||||
|
if (i_date.isDefined && i_date.get.isDefined) {
|
||||||
|
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||||
|
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||||
|
}
|
||||||
|
else if (publication_year != null) {
|
||||||
|
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||||
|
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||||
|
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||||
|
.filter(d => d._1.isDefined)
|
||||||
|
.map(d => (d._1.get, vocabularies.getTermAsQualifier("dnet:dataCite_date", d._2.toLowerCase())))
|
||||||
|
.filter(d => d._2 != null)
|
||||||
|
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
||||||
|
|
||||||
|
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||||
|
|
||||||
|
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
||||||
|
.map(s =>
|
||||||
|
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null)
|
||||||
|
).asJava)
|
||||||
|
|
||||||
|
|
||||||
|
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||||
|
|
||||||
|
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||||
|
|
||||||
|
result.setDescription(
|
||||||
|
descriptions
|
||||||
|
.filter(d => d.description.isDefined).
|
||||||
|
map(d =>
|
||||||
|
OafMapperUtils.field(d.description.get, null)
|
||||||
|
).filter(s => s!=null).asJava)
|
||||||
|
|
||||||
|
|
||||||
|
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||||
|
if (publisher != null)
|
||||||
|
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||||
|
|
||||||
|
|
||||||
|
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||||
|
|
||||||
|
if (language != null)
|
||||||
|
result.setLanguage(vocabularies.getSynonymAsQualifier("dnet:languages", language))
|
||||||
|
|
||||||
|
|
||||||
|
val instance = result.getInstance().get(0)
|
||||||
|
|
||||||
|
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||||
|
|
||||||
|
val accessRights:List[String] = for {
|
||||||
|
JObject(rightsList) <- json \\ "rightsList"
|
||||||
|
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||||
|
} yield rightsUri
|
||||||
|
|
||||||
|
val aRights: Option[Qualifier] = accessRights.map(r => {
|
||||||
|
vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r)
|
||||||
|
}).find(q => q != null)
|
||||||
|
|
||||||
|
|
||||||
|
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY)
|
||||||
|
|
||||||
|
if (client.isDefined) {
|
||||||
|
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||||
|
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||||
|
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||||
|
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||||
|
// instance.setAccessright(access_rights_qualifier)
|
||||||
|
|
||||||
|
//'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]">
|
||||||
|
val license = accessRights
|
||||||
|
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
||||||
|
if (license.isDefined)
|
||||||
|
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
val awardUris:List[String] = for {
|
||||||
|
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||||
|
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||||
|
} yield awardUri
|
||||||
|
|
||||||
|
val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null)
|
||||||
|
|
||||||
|
if (relations!= null && relations.nonEmpty) {
|
||||||
|
List(result):::relations
|
||||||
|
}
|
||||||
|
else
|
||||||
|
List(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateDataInfo(trust: String): DataInfo = {
|
||||||
|
val di = new DataInfo
|
||||||
|
di.setDeletedbyinference(false)
|
||||||
|
di.setInferred(false)
|
||||||
|
di.setInvisible(false)
|
||||||
|
di.setTrust(trust)
|
||||||
|
di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions"))
|
||||||
|
di
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateDSId(input: String): String = {
|
||||||
|
val b = StringUtils.substringBefore(input, "::")
|
||||||
|
val a = StringUtils.substringAfter(input, "::")
|
||||||
|
s"10|$b::${DHPUtils.md5(a)}"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
|
import org.apache.hadoop.io.Text
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object ExportActionSetJobNode {
|
||||||
|
|
||||||
|
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val conf = new SparkConf
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
val master = parser.get("master")
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
|
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||||
|
.appName(ExportActionSetJobNode.getClass.getSimpleName)
|
||||||
|
.master(master)
|
||||||
|
.getOrCreate()
|
||||||
|
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
|
||||||
|
|
||||||
|
spark.read.load(sourcePath).as[Oaf]
|
||||||
|
.map(o =>DataciteToOAFTransformation.toActionSet(o))
|
||||||
|
.filter(o => o!= null)
|
||||||
|
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object GenerateDataciteDatasetSpark {
|
||||||
|
|
||||||
|
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val conf = new SparkConf
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
val master = parser.get("master")
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||||
|
log.info("isLookupUrl: {}", isLookupUrl)
|
||||||
|
|
||||||
|
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||||
|
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||||
|
log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}")
|
||||||
|
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||||
|
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
|
||||||
|
.master(master)
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
|
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||||
|
|
||||||
|
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
spark.read.load(sourcePath).as[DataciteType]
|
||||||
|
.filter(d => d.isActive)
|
||||||
|
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
|
||||||
|
.filter(d => d != null)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,169 @@
|
||||||
|
package eu.dnetlib.dhp.actionmanager.datacite
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
|
||||||
|
import org.apache.hadoop.hdfs.DistributedFileSystem
|
||||||
|
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
|
||||||
|
import org.apache.spark.SparkContext
|
||||||
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
import org.apache.spark.sql.functions.max
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import java.time.format.DateTimeFormatter._
|
||||||
|
import java.time.{LocalDateTime, ZoneOffset}
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
object ImportDatacite {
|
||||||
|
|
||||||
|
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
||||||
|
|
||||||
|
|
||||||
|
def convertAPIStringToDataciteItem(input:String): DataciteType = {
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
lazy val json: org.json4s.JValue = parse(input)
|
||||||
|
val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase
|
||||||
|
|
||||||
|
val isActive = (json \ "attributes" \ "isActive").extract[Boolean]
|
||||||
|
|
||||||
|
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
||||||
|
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
||||||
|
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli/1000, isActive = isActive, json = input)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
val master = parser.get("master")
|
||||||
|
|
||||||
|
val hdfsuri = parser.get("namenode")
|
||||||
|
log.info(s"namenode is $hdfsuri")
|
||||||
|
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
log.info(s"targetPath is $targetPath")
|
||||||
|
|
||||||
|
val dataciteDump = parser.get("dataciteDumpPath")
|
||||||
|
log.info(s"dataciteDump is $dataciteDump")
|
||||||
|
|
||||||
|
val hdfsTargetPath =new Path(targetPath)
|
||||||
|
log.info(s"hdfsTargetPath is $hdfsTargetPath")
|
||||||
|
|
||||||
|
val spark: SparkSession = SparkSession.builder()
|
||||||
|
.appName(ImportDatacite.getClass.getSimpleName)
|
||||||
|
.master(master)
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
|
// ====== Init HDFS File System Object
|
||||||
|
val conf = new Configuration
|
||||||
|
// Set FileSystem URI
|
||||||
|
conf.set("fs.defaultFS", hdfsuri)
|
||||||
|
|
||||||
|
// Because of Maven
|
||||||
|
conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
|
||||||
|
conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
|
||||||
|
val sc:SparkContext = spark.sparkContext
|
||||||
|
sc.setLogLevel("ERROR")
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
|
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||||
|
|
||||||
|
override def zero: DataciteType = null
|
||||||
|
|
||||||
|
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||||
|
if (b == null)
|
||||||
|
return a
|
||||||
|
if (a == null)
|
||||||
|
return b
|
||||||
|
if(a.timestamp >b.timestamp) {
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
b
|
||||||
|
}
|
||||||
|
|
||||||
|
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||||
|
reduce(a,b)
|
||||||
|
}
|
||||||
|
|
||||||
|
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||||
|
|
||||||
|
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||||
|
|
||||||
|
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||||
|
}
|
||||||
|
|
||||||
|
val dump:Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
|
||||||
|
val ts = dump.select(max("timestamp")).first().getLong(0)
|
||||||
|
|
||||||
|
log.info(s"last Timestamp is $ts")
|
||||||
|
|
||||||
|
val cnt = writeSequenceFile(hdfsTargetPath, ts, conf)
|
||||||
|
|
||||||
|
|
||||||
|
log.info(s"Imported from Datacite API $cnt documents")
|
||||||
|
|
||||||
|
if (cnt > 0) {
|
||||||
|
|
||||||
|
val inputRdd:RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||||
|
.map(s => s._2.toString)
|
||||||
|
.map(s => convertAPIStringToDataciteItem(s))
|
||||||
|
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
||||||
|
|
||||||
|
val ds:Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]
|
||||||
|
|
||||||
|
dump
|
||||||
|
.union(ds)
|
||||||
|
.groupByKey(_.doi)
|
||||||
|
.agg(dataciteAggregator.toColumn)
|
||||||
|
.map(s=>s._2)
|
||||||
|
.repartition(4000)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
|
||||||
|
|
||||||
|
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||||
|
fs.delete(new Path(s"$dataciteDump"), true)
|
||||||
|
fs.rename(new Path(s"${dataciteDump}_updated"),new Path(s"$dataciteDump"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration):Long = {
|
||||||
|
val client = new DataciteAPIImporter(timestamp*1000, 1000)
|
||||||
|
var i = 0
|
||||||
|
try {
|
||||||
|
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
|
||||||
|
try {
|
||||||
|
|
||||||
|
var start: Long = System.currentTimeMillis
|
||||||
|
var end: Long = 0
|
||||||
|
val key: IntWritable = new IntWritable(i)
|
||||||
|
val value: Text = new Text
|
||||||
|
while ( {
|
||||||
|
client.hasNext
|
||||||
|
}) {
|
||||||
|
key.set({
|
||||||
|
i += 1;
|
||||||
|
i - 1
|
||||||
|
})
|
||||||
|
value.set(client.next())
|
||||||
|
writer.append(key, value)
|
||||||
|
writer.hflush()
|
||||||
|
if (i % 1000 == 0) {
|
||||||
|
end = System.currentTimeMillis
|
||||||
|
val time = (end - start) / 1000.0F
|
||||||
|
println(s"Imported $i in $time seconds")
|
||||||
|
start = System.currentTimeMillis
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} finally if (writer != null) writer.close()
|
||||||
|
}
|
||||||
|
i
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,20 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
|
||||||
|
|
||||||
import java.util.LinkedList;
|
|
||||||
|
|
||||||
public class CollectorPluginErrorLogList extends LinkedList<String> {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = -6925786561303289704L;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
String log = new String();
|
|
||||||
int index = 0;
|
|
||||||
for (String errorMessage : this) {
|
|
||||||
log += String.format("Retry #%s: %s / ", index++, errorMessage);
|
|
||||||
}
|
|
||||||
return log;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
|
||||||
|
|
||||||
public class CollectorServiceException extends Exception {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = 7523999812098059764L;
|
|
||||||
|
|
||||||
public CollectorServiceException(String string) {
|
|
||||||
super(string);
|
|
||||||
}
|
|
||||||
|
|
||||||
public CollectorServiceException(String string, Throwable exception) {
|
|
||||||
super(string, exception);
|
|
||||||
}
|
|
||||||
|
|
||||||
public CollectorServiceException(Throwable exception) {
|
|
||||||
super(exception);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,240 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.net.*;
|
|
||||||
import java.security.GeneralSecurityException;
|
|
||||||
import java.security.cert.X509Certificate;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import javax.net.ssl.HttpsURLConnection;
|
|
||||||
import javax.net.ssl.SSLContext;
|
|
||||||
import javax.net.ssl.TrustManager;
|
|
||||||
import javax.net.ssl.X509TrustManager;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author jochen, michele, andrea
|
|
||||||
*/
|
|
||||||
public class HttpConnector {
|
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
|
||||||
|
|
||||||
private int maxNumberOfRetry = 6;
|
|
||||||
private int defaultDelay = 120; // seconds
|
|
||||||
private int readTimeOut = 120; // seconds
|
|
||||||
|
|
||||||
private String responseType = null;
|
|
||||||
|
|
||||||
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
|
||||||
|
|
||||||
public HttpConnector() {
|
|
||||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given the URL returns the content via HTTP GET
|
|
||||||
*
|
|
||||||
* @param requestUrl the URL
|
|
||||||
* @return the content of the downloaded resource
|
|
||||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
|
||||||
*/
|
|
||||||
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
|
||||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given the URL returns the content as a stream via HTTP GET
|
|
||||||
*
|
|
||||||
* @param requestUrl the URL
|
|
||||||
* @return the content of the downloaded resource as InputStream
|
|
||||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
|
||||||
*/
|
|
||||||
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
|
||||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
|
|
||||||
final CollectorPluginErrorLogList errorList)
|
|
||||||
throws CollectorServiceException {
|
|
||||||
try {
|
|
||||||
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
try {
|
|
||||||
return IOUtils.toString(s);
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList.add(e.getMessage());
|
|
||||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} finally {
|
|
||||||
IOUtils.closeQuietly(s);
|
|
||||||
}
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
throw new CollectorServiceException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
|
||||||
final CollectorPluginErrorLogList errorList)
|
|
||||||
throws CollectorServiceException {
|
|
||||||
|
|
||||||
if (retryNumber > maxNumberOfRetry) {
|
|
||||||
throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
|
||||||
try {
|
|
||||||
InputStream input = null;
|
|
||||||
|
|
||||||
try {
|
|
||||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
|
||||||
urlConn.setInstanceFollowRedirects(false);
|
|
||||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
|
||||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
|
||||||
|
|
||||||
if (log.isDebugEnabled()) {
|
|
||||||
logHeaderFields(urlConn);
|
|
||||||
}
|
|
||||||
|
|
||||||
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
|
||||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
|
||||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
|
||||||
Thread.sleep(retryAfter * 1000);
|
|
||||||
errorList.add("503 Service Unavailable");
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
|
|
||||||
|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
|
|
||||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
|
||||||
log.debug("The requested url has been moved to " + newUrl);
|
|
||||||
errorList
|
|
||||||
.add(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(),
|
|
||||||
newUrl));
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
|
||||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
|
||||||
log
|
|
||||||
.error(
|
|
||||||
String
|
|
||||||
.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} else {
|
|
||||||
input = urlConn.getInputStream();
|
|
||||||
responseType = urlConn.getContentType();
|
|
||||||
return input;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList.add(e.getMessage());
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
}
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
throw new CollectorServiceException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
|
||||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
|
||||||
|
|
||||||
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
|
||||||
if (e.getKey() != null) {
|
|
||||||
for (String v : e.getValue()) {
|
|
||||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
|
||||||
for (String key : headerMap.keySet()) {
|
|
||||||
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
|
|
||||||
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
|
||||||
return Integer
|
|
||||||
.parseInt(headerMap.get(key).get(0)) + 10;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
|
||||||
for (String key : headerMap.keySet()) {
|
|
||||||
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
|
|
||||||
return headerMap.get(key).get(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
|
||||||
*/
|
|
||||||
public void initTrustManager() {
|
|
||||||
final X509TrustManager tm = new X509TrustManager() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public X509Certificate[] getAcceptedIssuers() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
try {
|
|
||||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
|
||||||
ctx.init(null, new TrustManager[] {
|
|
||||||
tm
|
|
||||||
}, null);
|
|
||||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
|
||||||
} catch (GeneralSecurityException e) {
|
|
||||||
log.fatal(e);
|
|
||||||
throw new IllegalStateException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getMaxNumberOfRetry() {
|
|
||||||
return maxNumberOfRetry;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
|
||||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getDefaultDelay() {
|
|
||||||
return defaultDelay;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setDefaultDelay(final int defaultDelay) {
|
|
||||||
this.defaultDelay = defaultDelay;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReadTimeOut() {
|
|
||||||
return readTimeOut;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReadTimeOut(final int readTimeOut) {
|
|
||||||
this.readTimeOut = readTimeOut;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getResponseType() {
|
|
||||||
return responseType;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Applies the parsing of a csv file and writes the Serialization of it in hdfs
|
* Applies the parsing of a csv file and writes the Serialization of it in hdfs
|
||||||
|
@ -74,7 +74,7 @@ public class ReadCSV implements Closeable {
|
||||||
throws Exception {
|
throws Exception {
|
||||||
this.conf = new Configuration();
|
this.conf = new Configuration();
|
||||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||||
HttpConnector httpConnector = new HttpConnector();
|
HttpConnector2 httpConnector = new HttpConnector2();
|
||||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||||
Path hdfsWritePath = new Path(hdfsPath);
|
Path hdfsWritePath = new Path(hdfsPath);
|
||||||
FSDataOutputStream fsDataOutputStream = null;
|
FSDataOutputStream fsDataOutputStream = null;
|
||||||
|
|
|
@ -14,13 +14,12 @@ import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Applies the parsing of an excel file and writes the Serialization of it in hdfs
|
* Applies the parsing of an excel file and writes the Serialization of it in hdfs
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ReadExcel implements Closeable {
|
public class ReadExcel implements Closeable {
|
||||||
private static final Log log = LogFactory.getLog(ReadCSV.class);
|
private static final Log log = LogFactory.getLog(ReadCSV.class);
|
||||||
private final Configuration conf;
|
private final Configuration conf;
|
||||||
|
@ -72,7 +71,7 @@ public class ReadExcel implements Closeable {
|
||||||
throws Exception {
|
throws Exception {
|
||||||
this.conf = new Configuration();
|
this.conf = new Configuration();
|
||||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||||
HttpConnector httpConnector = new HttpConnector();
|
HttpConnector2 httpConnector = new HttpConnector2();
|
||||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||||
Path hdfsWritePath = new Path(hdfsPath);
|
Path hdfsWritePath = new Path(hdfsPath);
|
||||||
FSDataOutputStream fsDataOutputStream = null;
|
FSDataOutputStream fsDataOutputStream = null;
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation.common;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
|
||||||
|
public class AggregationCounter implements Serializable {
|
||||||
|
private LongAccumulator totalItems;
|
||||||
|
private LongAccumulator errorItems;
|
||||||
|
private LongAccumulator processedItems;
|
||||||
|
|
||||||
|
public AggregationCounter() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) {
|
||||||
|
this.totalItems = totalItems;
|
||||||
|
this.errorItems = errorItems;
|
||||||
|
this.processedItems = processedItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongAccumulator getTotalItems() {
|
||||||
|
return totalItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTotalItems(LongAccumulator totalItems) {
|
||||||
|
this.totalItems = totalItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongAccumulator getErrorItems() {
|
||||||
|
return errorItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setErrorItems(LongAccumulator errorItems) {
|
||||||
|
this.errorItems = errorItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongAccumulator getProcessedItems() {
|
||||||
|
return processedItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProcessedItems(LongAccumulator processedItems) {
|
||||||
|
this.processedItems = processedItems;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation.common;
|
||||||
|
|
||||||
|
import java.io.Closeable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.message.MessageSender;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
|
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
|
||||||
|
|
||||||
|
private MessageSender messageSender;
|
||||||
|
|
||||||
|
public AggregatorReport() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public AggregatorReport(MessageSender messageSender) throws IOException {
|
||||||
|
this.messageSender = messageSender;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void ongoing(Long current, Long total) {
|
||||||
|
messageSender.sendMessage(current, total);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
if (Objects.nonNull(messageSender)) {
|
||||||
|
log.info("closing report: ");
|
||||||
|
this.forEach((k, v) -> log.info("{} - {}", k, v));
|
||||||
|
|
||||||
|
Map<String, String> m = new HashMap<>();
|
||||||
|
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
|
||||||
|
messageSender.sendReport(m);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,10 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation.common;
|
||||||
|
|
||||||
|
public interface ReporterCallback {
|
||||||
|
|
||||||
|
Long getCurrent();
|
||||||
|
|
||||||
|
Long getTotal();
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation.common;
|
||||||
|
|
||||||
|
import java.util.TimerTask;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
public abstract class ReportingJob {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frequency (seconds) for sending ongoing messages to report the collection task advancement
|
||||||
|
*/
|
||||||
|
public static final int ONGOING_REPORT_FREQUENCY = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initial delay (seconds) for sending ongoing messages to report the collection task advancement
|
||||||
|
*/
|
||||||
|
public static final int INITIAL_DELAY = 2;
|
||||||
|
|
||||||
|
private ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
|
||||||
|
|
||||||
|
protected final AggregatorReport report;
|
||||||
|
|
||||||
|
public ReportingJob(AggregatorReport report) {
|
||||||
|
this.report = report;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void schedule(final ReporterCallback callback) {
|
||||||
|
executor.scheduleAtFixedRate(new TimerTask() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
report.ongoing(callback.getCurrent(), callback.getTotal());
|
||||||
|
}
|
||||||
|
}, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void shutdown() {
|
||||||
|
executor.shutdown();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,136 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation.mdstore;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.Constants.*;
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||||
|
|
||||||
|
import java.net.URI;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.rest.DNetRestClient;
|
||||||
|
|
||||||
|
public class MDStoreActionNode {
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
|
||||||
|
|
||||||
|
enum MDAction {
|
||||||
|
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
|
||||||
|
|
||||||
|
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
|
||||||
|
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
|
||||||
|
|
||||||
|
public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
|
||||||
|
public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
|
||||||
|
|
||||||
|
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
|
||||||
|
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
MDStoreActionNode.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
|
||||||
|
argumentParser.parseArgument(args);
|
||||||
|
|
||||||
|
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
|
||||||
|
|
||||||
|
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
|
||||||
|
log.info("Current action is {}", action);
|
||||||
|
|
||||||
|
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
|
||||||
|
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
|
||||||
|
|
||||||
|
switch (action) {
|
||||||
|
case NEW_VERSION: {
|
||||||
|
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||||
|
if (StringUtils.isBlank(mdStoreID)) {
|
||||||
|
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||||
|
}
|
||||||
|
final MDStoreVersion currentVersion = DNetRestClient
|
||||||
|
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||||
|
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case COMMIT: {
|
||||||
|
|
||||||
|
final String hdfsuri = argumentParser.get("namenode");
|
||||||
|
if (StringUtils.isBlank(hdfsuri)) {
|
||||||
|
throw new IllegalArgumentException("missing or empty argument namenode");
|
||||||
|
}
|
||||||
|
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||||
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||||
|
}
|
||||||
|
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
|
||||||
|
|
||||||
|
try (
|
||||||
|
FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri));
|
||||||
|
FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) {
|
||||||
|
|
||||||
|
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
|
||||||
|
|
||||||
|
fs.create(hdfstoreSizepath);
|
||||||
|
DNetRestClient
|
||||||
|
.doGET(
|
||||||
|
String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ROLLBACK: {
|
||||||
|
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||||
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||||
|
}
|
||||||
|
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case READ_LOCK: {
|
||||||
|
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||||
|
if (StringUtils.isBlank(mdStoreID)) {
|
||||||
|
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||||
|
}
|
||||||
|
final MDStoreVersion currentVersion = DNetRestClient
|
||||||
|
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||||
|
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case READ_UNLOCK: {
|
||||||
|
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
|
||||||
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||||
|
}
|
||||||
|
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("invalid action");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,16 +1,16 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker;
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
public class DnetCollectorException extends Exception {
|
public class CollectorException extends Exception {
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
private static final long serialVersionUID = -290723075076039757L;
|
private static final long serialVersionUID = -290723075076039757L;
|
||||||
|
|
||||||
public DnetCollectorException() {
|
public CollectorException() {
|
||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
public DnetCollectorException(
|
public CollectorException(
|
||||||
final String message,
|
final String message,
|
||||||
final Throwable cause,
|
final Throwable cause,
|
||||||
final boolean enableSuppression,
|
final boolean enableSuppression,
|
||||||
|
@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception {
|
||||||
super(message, cause, enableSuppression, writableStackTrace);
|
super(message, cause, enableSuppression, writableStackTrace);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DnetCollectorException(final String message, final Throwable cause) {
|
public CollectorException(final String message, final Throwable cause) {
|
||||||
super(message, cause);
|
super(message, cause);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DnetCollectorException(final String message) {
|
public CollectorException(final String message) {
|
||||||
super(message);
|
super(message);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DnetCollectorException(final Throwable cause) {
|
public CollectorException(final Throwable cause) {
|
||||||
super(cause);
|
super(cause);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,134 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.IntWritable;
|
||||||
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.DeflateCodec;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
|
||||||
|
|
||||||
|
public class CollectorWorker extends ReportingJob {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
|
||||||
|
|
||||||
|
private final ApiDescriptor api;
|
||||||
|
|
||||||
|
private final FileSystem fileSystem;
|
||||||
|
|
||||||
|
private final MDStoreVersion mdStoreVersion;
|
||||||
|
|
||||||
|
private final HttpClientParams clientParams;
|
||||||
|
|
||||||
|
public CollectorWorker(
|
||||||
|
final ApiDescriptor api,
|
||||||
|
final FileSystem fileSystem,
|
||||||
|
final MDStoreVersion mdStoreVersion,
|
||||||
|
final HttpClientParams clientParams,
|
||||||
|
final AggregatorReport report) {
|
||||||
|
super(report);
|
||||||
|
this.api = api;
|
||||||
|
this.fileSystem = fileSystem;
|
||||||
|
this.mdStoreVersion = mdStoreVersion;
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
|
||||||
|
|
||||||
|
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||||
|
log.info("outputPath path is {}", outputPath);
|
||||||
|
|
||||||
|
final CollectorPlugin plugin = getCollectorPlugin();
|
||||||
|
final AtomicInteger counter = new AtomicInteger(0);
|
||||||
|
|
||||||
|
scheduleReport(counter);
|
||||||
|
|
||||||
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
|
.createWriter(
|
||||||
|
fileSystem.getConf(),
|
||||||
|
SequenceFile.Writer.file(new Path(outputPath)),
|
||||||
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class),
|
||||||
|
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||||
|
final IntWritable key = new IntWritable(counter.get());
|
||||||
|
final Text value = new Text();
|
||||||
|
plugin
|
||||||
|
.collect(api, report)
|
||||||
|
.forEach(
|
||||||
|
content -> {
|
||||||
|
key.set(counter.getAndIncrement());
|
||||||
|
value.set(content);
|
||||||
|
try {
|
||||||
|
writer.append(key, value);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (Throwable e) {
|
||||||
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
throw new CollectorException(e);
|
||||||
|
} finally {
|
||||||
|
shutdown();
|
||||||
|
report.ongoing(counter.longValue(), counter.longValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void scheduleReport(AtomicInteger counter) {
|
||||||
|
schedule(new ReporterCallback() {
|
||||||
|
@Override
|
||||||
|
public Long getCurrent() {
|
||||||
|
return counter.longValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Long getTotal() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
||||||
|
|
||||||
|
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
|
||||||
|
case oai:
|
||||||
|
return new OaiCollectorPlugin(clientParams);
|
||||||
|
case rest_json2xml:
|
||||||
|
return new RestCollectorPlugin(clientParams);
|
||||||
|
case other:
|
||||||
|
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||||
|
.ofNullable(api.getParams().get("other_plugin_type"))
|
||||||
|
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
||||||
|
.get();
|
||||||
|
|
||||||
|
switch (plugin) {
|
||||||
|
case mdstore_mongodb_dump:
|
||||||
|
return new MongoDbDumpCollectorPlugin(fileSystem);
|
||||||
|
case mdstore_mongodb:
|
||||||
|
return new MDStoreCollectorPlugin();
|
||||||
|
default:
|
||||||
|
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,135 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.Constants.*;
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.message.MessageSender;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes
|
||||||
|
* into HDFS. This application will be executed on the hadoop cluster, where invoked in the context of the metadata collection
|
||||||
|
* oozie workflow, it will receive all the input parameters necessary to instantiate the specific collection plugin and the
|
||||||
|
* relative specific configurations
|
||||||
|
*
|
||||||
|
* @author Sandro La Bruzzo, Claudio Atzori
|
||||||
|
*/
|
||||||
|
public class CollectorWorkerApplication {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
|
||||||
|
|
||||||
|
private FileSystem fileSystem;
|
||||||
|
|
||||||
|
public CollectorWorkerApplication(FileSystem fileSystem) {
|
||||||
|
this.fileSystem = fileSystem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param args
|
||||||
|
*/
|
||||||
|
public static void main(final String[] args)
|
||||||
|
throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
CollectorWorkerApplication.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json")));
|
||||||
|
argumentParser.parseArgument(args);
|
||||||
|
|
||||||
|
log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
|
||||||
|
|
||||||
|
final String hdfsuri = argumentParser.get("namenode");
|
||||||
|
log.info("hdfsURI is {}", hdfsuri);
|
||||||
|
|
||||||
|
final String apiDescriptor = argumentParser.get("apidescriptor");
|
||||||
|
log.info("apiDescriptor is {}", apiDescriptor);
|
||||||
|
|
||||||
|
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
|
||||||
|
log.info("mdStoreVersion is {}", mdStoreVersion);
|
||||||
|
|
||||||
|
final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL);
|
||||||
|
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
|
||||||
|
|
||||||
|
final String workflowId = argumentParser.get("workflowId");
|
||||||
|
log.info("workflowId is {}", workflowId);
|
||||||
|
|
||||||
|
final HttpClientParams clientParams = getClientParams(argumentParser);
|
||||||
|
|
||||||
|
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
|
||||||
|
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
|
||||||
|
|
||||||
|
new CollectorWorkerApplication(fileSystem)
|
||||||
|
.run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
|
||||||
|
String dnetMessageManagerURL, String workflowId)
|
||||||
|
throws IOException, CollectorException, UnknownCollectorPluginException {
|
||||||
|
|
||||||
|
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
||||||
|
final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
|
||||||
|
|
||||||
|
try (AggregatorReport report = new AggregatorReport(ms)) {
|
||||||
|
new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
|
||||||
|
final HttpClientParams clientParams = new HttpClientParams();
|
||||||
|
clientParams
|
||||||
|
.setMaxNumberOfRetry(
|
||||||
|
Optional
|
||||||
|
.ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY))
|
||||||
|
.map(Integer::parseInt)
|
||||||
|
.orElse(HttpClientParams._maxNumberOfRetry));
|
||||||
|
log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry());
|
||||||
|
|
||||||
|
clientParams
|
||||||
|
.setRequestDelay(
|
||||||
|
Optional
|
||||||
|
.ofNullable(argumentParser.get(REQUEST_DELAY))
|
||||||
|
.map(Integer::parseInt)
|
||||||
|
.orElse(HttpClientParams._requestDelay));
|
||||||
|
log.info("requestDelay is {}", clientParams.getRequestDelay());
|
||||||
|
|
||||||
|
clientParams
|
||||||
|
.setRetryDelay(
|
||||||
|
Optional
|
||||||
|
.ofNullable(argumentParser.get(RETRY_DELAY))
|
||||||
|
.map(Integer::parseInt)
|
||||||
|
.orElse(HttpClientParams._retryDelay));
|
||||||
|
log.info("retryDelay is {}", clientParams.getRetryDelay());
|
||||||
|
|
||||||
|
clientParams
|
||||||
|
.setConnectTimeOut(
|
||||||
|
Optional
|
||||||
|
.ofNullable(argumentParser.get(CONNECT_TIMEOUT))
|
||||||
|
.map(Integer::parseInt)
|
||||||
|
.orElse(HttpClientParams._connectTimeOut));
|
||||||
|
log.info("connectTimeOut is {}", clientParams.getConnectTimeOut());
|
||||||
|
|
||||||
|
clientParams
|
||||||
|
.setReadTimeOut(
|
||||||
|
Optional
|
||||||
|
.ofNullable(argumentParser.get(READ_TIMEOUT))
|
||||||
|
.map(Integer::parseInt)
|
||||||
|
.orElse(HttpClientParams._readTimeOut));
|
||||||
|
log.info("readTimeOut is {}", clientParams.getReadTimeOut());
|
||||||
|
return clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,28 +1,26 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection;
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.Constants.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.cli.*;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.io.IntWritable;
|
import org.apache.hadoop.io.IntWritable;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Encoder;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.expressions.Aggregator;
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
@ -30,19 +28,172 @@ import org.dom4j.io.SAXReader;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
||||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
import eu.dnetlib.dhp.schema.mdstore.Provenance;
|
||||||
import eu.dnetlib.message.Message;
|
import scala.Tuple2;
|
||||||
import eu.dnetlib.message.MessageManager;
|
|
||||||
import eu.dnetlib.message.MessageType;
|
|
||||||
|
|
||||||
public class GenerateNativeStoreSparkJob {
|
public class GenerateNativeStoreSparkJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
|
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GenerateNativeStoreSparkJob.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/collection/generate_native_input_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
final String provenanceArgument = parser.get("provenance");
|
||||||
|
log.info("Provenance is {}", provenanceArgument);
|
||||||
|
final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class);
|
||||||
|
|
||||||
|
final String dateOfCollectionArgs = parser.get("dateOfCollection");
|
||||||
|
log.info("dateOfCollection is {}", dateOfCollectionArgs);
|
||||||
|
final Long dateOfCollection = new Long(dateOfCollectionArgs);
|
||||||
|
|
||||||
|
String mdStoreVersion = parser.get("mdStoreVersion");
|
||||||
|
log.info("mdStoreVersion is {}", mdStoreVersion);
|
||||||
|
|
||||||
|
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
||||||
|
|
||||||
|
String readMdStoreVersionParam = parser.get("readMdStoreVersion");
|
||||||
|
log.info("readMdStoreVersion is {}", readMdStoreVersionParam);
|
||||||
|
|
||||||
|
final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null
|
||||||
|
: MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class);
|
||||||
|
|
||||||
|
final String xpath = parser.get("xpath");
|
||||||
|
log.info("xpath is {}", xpath);
|
||||||
|
|
||||||
|
final String encoding = parser.get("encoding");
|
||||||
|
log.info("encoding is {}", encoding);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> createNativeMDStore(
|
||||||
|
spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void createNativeMDStore(SparkSession spark,
|
||||||
|
Provenance provenance,
|
||||||
|
Long dateOfCollection,
|
||||||
|
String xpath,
|
||||||
|
String encoding,
|
||||||
|
MDStoreVersion currentVersion,
|
||||||
|
MDStoreVersion readVersion) throws IOException {
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS);
|
||||||
|
final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS);
|
||||||
|
|
||||||
|
final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||||
|
final JavaRDD<MetadataRecord> nativeStore = sc
|
||||||
|
.sequenceFile(seqFilePath, IntWritable.class, Text.class)
|
||||||
|
.map(
|
||||||
|
item -> parseRecord(
|
||||||
|
item._2().toString(),
|
||||||
|
xpath,
|
||||||
|
encoding,
|
||||||
|
provenance,
|
||||||
|
dateOfCollection,
|
||||||
|
totalItems,
|
||||||
|
invalidRecords))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.distinct();
|
||||||
|
|
||||||
|
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
||||||
|
final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
|
||||||
|
|
||||||
|
final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH;
|
||||||
|
|
||||||
|
if (readVersion != null) { // INCREMENTAL MODE
|
||||||
|
log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath());
|
||||||
|
Dataset<MetadataRecord> currentMdStoreVersion = spark
|
||||||
|
.read()
|
||||||
|
.load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH)
|
||||||
|
.as(encoder);
|
||||||
|
TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn();
|
||||||
|
|
||||||
|
final Dataset<MetadataRecord> map = currentMdStoreVersion
|
||||||
|
.union(mdstore)
|
||||||
|
.groupByKey(
|
||||||
|
(MapFunction<MetadataRecord, String>) MetadataRecord::getId,
|
||||||
|
Encoders.STRING())
|
||||||
|
.agg(aggregator)
|
||||||
|
.map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder);
|
||||||
|
|
||||||
|
map.select("id").takeAsList(100).forEach(s -> log.info(s.toString()));
|
||||||
|
|
||||||
|
saveDataset(map, targetPath);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
saveDataset(mdstore, targetPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
final Long total = spark.read().load(targetPath).count();
|
||||||
|
log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName());
|
||||||
|
|
||||||
|
writeHdfsFile(
|
||||||
|
spark.sparkContext().hadoopConfiguration(), total.toString(),
|
||||||
|
currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MetadataRecord zero() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) {
|
||||||
|
return getLatestRecord(b, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MetadataRecord merge(MetadataRecord b, MetadataRecord a) {
|
||||||
|
return getLatestRecord(b, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) {
|
||||||
|
if (b == null)
|
||||||
|
return a;
|
||||||
|
|
||||||
|
if (a == null)
|
||||||
|
return b;
|
||||||
|
return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MetadataRecord finish(MetadataRecord r) {
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Encoder<MetadataRecord> bufferEncoder() {
|
||||||
|
return Encoders.bean(MetadataRecord.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Encoder<MetadataRecord> outputEncoder() {
|
||||||
|
return Encoders.bean(MetadataRecord.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public static MetadataRecord parseRecord(
|
public static MetadataRecord parseRecord(
|
||||||
final String input,
|
final String input,
|
||||||
final String xpath,
|
final String xpath,
|
||||||
|
@ -64,112 +215,11 @@ public class GenerateNativeStoreSparkJob {
|
||||||
invalidRecords.add(1);
|
invalidRecords.add(1);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection);
|
return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (invalidRecords != null)
|
invalidRecords.add(1);
|
||||||
invalidRecords.add(1);
|
|
||||||
e.printStackTrace();
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
|
||||||
IOUtils
|
|
||||||
.toString(
|
|
||||||
GenerateNativeStoreSparkJob.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
|
|
||||||
parser.parseArgument(args);
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
|
||||||
final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
|
|
||||||
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
final Map<String, String> ongoingMap = new HashMap<>();
|
|
||||||
final Map<String, String> reportMap = new HashMap<>();
|
|
||||||
|
|
||||||
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
final JavaPairRDD<IntWritable, Text> inputRDD = sc
|
|
||||||
.sequenceFile(parser.get("input"), IntWritable.class, Text.class);
|
|
||||||
|
|
||||||
final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
|
|
||||||
final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
|
|
||||||
|
|
||||||
final MessageManager manager = new MessageManager(
|
|
||||||
parser.get("rabbitHost"),
|
|
||||||
parser.get("rabbitUser"),
|
|
||||||
parser.get("rabbitPassword"),
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
null);
|
|
||||||
|
|
||||||
final JavaRDD<MetadataRecord> mappeRDD = inputRDD
|
|
||||||
.map(
|
|
||||||
item -> parseRecord(
|
|
||||||
item._2().toString(),
|
|
||||||
parser.get("xpath"),
|
|
||||||
parser.get("encoding"),
|
|
||||||
provenance,
|
|
||||||
dateOfCollection,
|
|
||||||
totalItems,
|
|
||||||
invalidRecords))
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.distinct();
|
|
||||||
|
|
||||||
ongoingMap.put("ongoing", "0");
|
|
||||||
if (!test) {
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(
|
|
||||||
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
|
|
||||||
parser.get("rabbitOngoingQueue"),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
}
|
|
||||||
|
|
||||||
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
|
||||||
final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
|
|
||||||
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
|
|
||||||
mdStoreRecords.add(mdstore.count());
|
|
||||||
ongoingMap.put("ongoing", "" + totalItems.value());
|
|
||||||
if (!test) {
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(
|
|
||||||
parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
|
|
||||||
parser.get("rabbitOngoingQueue"),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
}
|
|
||||||
mdstore.write().format("parquet").save(parser.get("output"));
|
|
||||||
reportMap.put("inputItem", "" + totalItems.value());
|
|
||||||
reportMap.put("invalidRecords", "" + invalidRecords.value());
|
|
||||||
reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
|
|
||||||
if (!test) {
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
|
|
||||||
parser.get("rabbitReportQueue"),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
manager.close();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,94 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bundles the http connection parameters driving the client behaviour.
|
||||||
|
*/
|
||||||
|
public class HttpClientParams {
|
||||||
|
|
||||||
|
// Defaults
|
||||||
|
public static int _maxNumberOfRetry = 3;
|
||||||
|
public static int _requestDelay = 0; // milliseconds
|
||||||
|
public static int _retryDelay = 10; // seconds
|
||||||
|
public static int _connectTimeOut = 10; // seconds
|
||||||
|
public static int _readTimeOut = 30; // seconds
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum number of allowed retires before failing
|
||||||
|
*/
|
||||||
|
private int maxNumberOfRetry;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delay between request (Milliseconds)
|
||||||
|
*/
|
||||||
|
private int requestDelay;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Time to wait after a failure before retrying (Seconds)
|
||||||
|
*/
|
||||||
|
private int retryDelay;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Connect timeout (Seconds)
|
||||||
|
*/
|
||||||
|
private int connectTimeOut;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read timeout (Seconds)
|
||||||
|
*/
|
||||||
|
private int readTimeOut;
|
||||||
|
|
||||||
|
public HttpClientParams() {
|
||||||
|
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
|
||||||
|
int readTimeOut) {
|
||||||
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||||
|
this.requestDelay = requestDelay;
|
||||||
|
this.retryDelay = retryDelay;
|
||||||
|
this.connectTimeOut = connectTimeOut;
|
||||||
|
this.readTimeOut = readTimeOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxNumberOfRetry() {
|
||||||
|
return maxNumberOfRetry;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
|
||||||
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRequestDelay() {
|
||||||
|
return requestDelay;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRequestDelay(int requestDelay) {
|
||||||
|
this.requestDelay = requestDelay;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRetryDelay() {
|
||||||
|
return retryDelay;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRetryDelay(int retryDelay) {
|
||||||
|
this.retryDelay = retryDelay;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setConnectTimeOut(int connectTimeOut) {
|
||||||
|
this.connectTimeOut = connectTimeOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getConnectTimeOut() {
|
||||||
|
return connectTimeOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReadTimeOut() {
|
||||||
|
return readTimeOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReadTimeOut(int readTimeOut) {
|
||||||
|
this.readTimeOut = readTimeOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,259 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.net.*;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.math.NumberUtils;
|
||||||
|
import org.apache.http.HttpHeaders;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
|
||||||
|
*
|
||||||
|
* @author jochen, michele, andrea, alessia, claudio
|
||||||
|
*/
|
||||||
|
public class HttpConnector2 {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
|
||||||
|
|
||||||
|
private static final String REPORT_PREFIX = "http:";
|
||||||
|
|
||||||
|
private HttpClientParams clientParams;
|
||||||
|
|
||||||
|
private String responseType = null;
|
||||||
|
|
||||||
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||||
|
|
||||||
|
public HttpConnector2() {
|
||||||
|
this(new HttpClientParams());
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpConnector2(HttpClientParams clientParams) {
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
|
||||||
|
*/
|
||||||
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
|
||||||
|
return IOUtils.toInputStream(getInputSource(requestUrl));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
|
||||||
|
*/
|
||||||
|
public String getInputSource(final String requestUrl) throws CollectorException {
|
||||||
|
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given the URL returns the content via HTTP GET
|
||||||
|
*
|
||||||
|
* @param requestUrl the URL
|
||||||
|
* @param report the list of errors
|
||||||
|
* @return the content of the downloaded resource
|
||||||
|
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
||||||
|
*/
|
||||||
|
public String getInputSource(final String requestUrl, AggregatorReport report)
|
||||||
|
throws CollectorException {
|
||||||
|
return attemptDownloadAsString(requestUrl, 1, report);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
|
||||||
|
final AggregatorReport report) throws CollectorException {
|
||||||
|
|
||||||
|
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
|
||||||
|
return IOUtils.toString(s);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
throw new CollectorException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
||||||
|
final AggregatorReport report) throws CollectorException, IOException {
|
||||||
|
|
||||||
|
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
|
||||||
|
final String msg = String
|
||||||
|
.format(
|
||||||
|
"Max number of retries (%s/%s) exceeded, failing.",
|
||||||
|
retryNumber, getClientParams().getMaxNumberOfRetry());
|
||||||
|
log.error(msg);
|
||||||
|
throw new CollectorException(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
|
||||||
|
|
||||||
|
InputStream input = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (getClientParams().getRequestDelay() > 0) {
|
||||||
|
backoffAndSleep(getClientParams().getRequestDelay());
|
||||||
|
}
|
||||||
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||||
|
urlConn.setInstanceFollowRedirects(false);
|
||||||
|
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
|
||||||
|
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
|
||||||
|
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
|
||||||
|
|
||||||
|
if (log.isDebugEnabled()) {
|
||||||
|
logHeaderFields(urlConn);
|
||||||
|
}
|
||||||
|
|
||||||
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||||
|
if (is2xx(urlConn.getResponseCode())) {
|
||||||
|
input = urlConn.getInputStream();
|
||||||
|
responseType = urlConn.getContentType();
|
||||||
|
return input;
|
||||||
|
}
|
||||||
|
if (is3xx(urlConn.getResponseCode())) {
|
||||||
|
// REDIRECTS
|
||||||
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||||
|
log.info(String.format("The requested url has been moved to %s", newUrl));
|
||||||
|
report
|
||||||
|
.put(
|
||||||
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||||
|
String.format("Moved to: %s", newUrl));
|
||||||
|
urlConn.disconnect();
|
||||||
|
if (retryAfter > 0) {
|
||||||
|
backoffAndSleep(retryAfter);
|
||||||
|
}
|
||||||
|
return attemptDownload(newUrl, retryNumber + 1, report);
|
||||||
|
}
|
||||||
|
if (is4xx(urlConn.getResponseCode())) {
|
||||||
|
// CLIENT ERROR, DO NOT RETRY
|
||||||
|
report
|
||||||
|
.put(
|
||||||
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||||
|
String
|
||||||
|
.format(
|
||||||
|
"%s error: %s", requestUrl, urlConn.getResponseMessage()));
|
||||||
|
throw new CollectorException("4xx error: request will not be repeated. " + report);
|
||||||
|
}
|
||||||
|
if (is5xx(urlConn.getResponseCode())) {
|
||||||
|
// SERVER SIDE ERRORS RETRY ONLY on 503
|
||||||
|
switch (urlConn.getResponseCode()) {
|
||||||
|
case HttpURLConnection.HTTP_UNAVAILABLE:
|
||||||
|
if (retryAfter > 0) {
|
||||||
|
log
|
||||||
|
.warn(
|
||||||
|
requestUrl + " - waiting and repeating request after suggested retry-after "
|
||||||
|
+ retryAfter + " sec.");
|
||||||
|
backoffAndSleep(retryAfter * 1000);
|
||||||
|
} else {
|
||||||
|
log
|
||||||
|
.warn(
|
||||||
|
requestUrl + " - waiting and repeating request after default delay of "
|
||||||
|
+ getClientParams().getRetryDelay() + " sec.");
|
||||||
|
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
|
||||||
|
}
|
||||||
|
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
|
||||||
|
urlConn.disconnect();
|
||||||
|
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||||
|
default:
|
||||||
|
report
|
||||||
|
.put(
|
||||||
|
REPORT_PREFIX + urlConn.getResponseCode(),
|
||||||
|
String
|
||||||
|
.format(
|
||||||
|
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
|
||||||
|
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new CollectorException(
|
||||||
|
String
|
||||||
|
.format(
|
||||||
|
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||||
|
MAPPER.writeValueAsString(report)));
|
||||||
|
} catch (MalformedURLException | UnknownHostException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
throw new CollectorException(e.getMessage(), e);
|
||||||
|
} catch (SocketTimeoutException | SocketException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||||
|
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||||
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||||
|
|
||||||
|
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||||
|
if (e.getKey() != null) {
|
||||||
|
for (String v : e.getValue()) {
|
||||||
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
|
||||||
|
log.info("I'm going to sleep for {}ms", sleepTimeMs);
|
||||||
|
try {
|
||||||
|
Thread.sleep(sleepTimeMs);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
throw new CollectorException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||||
|
for (String key : headerMap.keySet()) {
|
||||||
|
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0)
|
||||||
|
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
||||||
|
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
|
||||||
|
for (String key : headerMap.keySet()) {
|
||||||
|
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
|
||||||
|
return headerMap.get(key).get(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean is2xx(final int statusCode) {
|
||||||
|
return statusCode >= 200 && statusCode <= 299;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean is4xx(final int statusCode) {
|
||||||
|
return statusCode >= 400 && statusCode <= 499;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean is3xx(final int statusCode) {
|
||||||
|
return statusCode >= 300 && statusCode <= 399;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean is5xx(final int statusCode) {
|
||||||
|
return statusCode >= 500 && statusCode <= 599;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getResponseType() {
|
||||||
|
return responseType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpClientParams getClientParams() {
|
||||||
|
return clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setClientParams(HttpClientParams clientParams) {
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
public class JsonUtils {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(JsonUtils.class);
|
||||||
|
|
||||||
|
public static final String wrapName = "recordWrap";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
|
||||||
|
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
|
||||||
|
* and work-around for the JSON to XML converting of org.json.XML-package.
|
||||||
|
*
|
||||||
|
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
|
||||||
|
*
|
||||||
|
* @param jsonInput
|
||||||
|
* @return convertedJsonKeynameOutput
|
||||||
|
*/
|
||||||
|
public String syntaxConvertJsonKeyNames(String jsonInput) {
|
||||||
|
|
||||||
|
log.trace("before convertJsonKeyNames: " + jsonInput);
|
||||||
|
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
|
||||||
|
// replace ' 's in JSON Namens with '_'
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace forward-slash (sign '/' ) in JSON Names with '_'
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace '(' in JSON Names with ''
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace ')' in JSON Names with ''
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// add prefix of startNumbers in JSON Keynames with 'n_'
|
||||||
|
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
|
||||||
|
}
|
||||||
|
// add prefix of only numbers in JSON Keynames with 'm_'
|
||||||
|
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
|
||||||
|
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
|
||||||
|
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
|
||||||
|
// }
|
||||||
|
|
||||||
|
// replace '=' in JSON Keynames with '-'
|
||||||
|
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
|
||||||
|
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
|
||||||
|
}
|
||||||
|
|
||||||
|
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
|
||||||
|
return jsonInput;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String convertToXML(final String jsonRecord) {
|
||||||
|
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||||
|
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
|
||||||
|
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
|
||||||
|
log.trace("before inputStream: " + resultXml);
|
||||||
|
resultXml = XmlCleaner.cleanAllEntities(resultXml);
|
||||||
|
log.trace("after cleaning: " + resultXml);
|
||||||
|
return resultXml;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
|
public class UnknownCollectorPluginException extends Exception {
|
||||||
|
|
||||||
|
/** */
|
||||||
|
private static final long serialVersionUID = -290723075076039757L;
|
||||||
|
|
||||||
|
public UnknownCollectorPluginException() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public UnknownCollectorPluginException(
|
||||||
|
final String message,
|
||||||
|
final Throwable cause,
|
||||||
|
final boolean enableSuppression,
|
||||||
|
final boolean writableStackTrace) {
|
||||||
|
super(message, cause, enableSuppression, writableStackTrace);
|
||||||
|
}
|
||||||
|
|
||||||
|
public UnknownCollectorPluginException(final String message, final Throwable cause) {
|
||||||
|
super(message, cause);
|
||||||
|
}
|
||||||
|
|
||||||
|
public UnknownCollectorPluginException(final String message) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public UnknownCollectorPluginException(final Throwable cause) {
|
||||||
|
super(cause);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker.utils;
|
package eu.dnetlib.dhp.collection;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
|
@ -3,10 +3,21 @@ package eu.dnetlib.dhp.collection.plugin;
|
||||||
|
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
|
||||||
public interface CollectorPlugin {
|
public interface CollectorPlugin {
|
||||||
|
|
||||||
Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
|
enum NAME {
|
||||||
|
oai, other, rest_json2xml;
|
||||||
|
|
||||||
|
public enum OTHER_NAME {
|
||||||
|
mdstore_mongodb_dump, mdstore_mongodb
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.mongodb;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Spliterator;
|
||||||
|
import java.util.Spliterators;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import org.bson.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.mongodb.client.MongoCollection;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.common.MdstoreClient;
|
||||||
|
|
||||||
|
public class MDStoreCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class);
|
||||||
|
|
||||||
|
public static final String MONGODB_DBNAME = "mongodb_dbname";
|
||||||
|
public static final String MDSTORE_ID = "mdstore_id";
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||||
|
|
||||||
|
final String mongoBaseUrl = Optional
|
||||||
|
.ofNullable(api.getBaseUrl())
|
||||||
|
.orElseThrow(
|
||||||
|
() -> new CollectorException(
|
||||||
|
"missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl"));
|
||||||
|
log.info("mongoBaseUrl: {}", mongoBaseUrl);
|
||||||
|
|
||||||
|
final String dbName = Optional
|
||||||
|
.ofNullable(api.getParams().get(MONGODB_DBNAME))
|
||||||
|
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME)));
|
||||||
|
log.info("dbName: {}", dbName);
|
||||||
|
|
||||||
|
final String mdId = Optional
|
||||||
|
.ofNullable(api.getParams().get(MDSTORE_ID))
|
||||||
|
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID)));
|
||||||
|
log.info("mdId: {}", mdId);
|
||||||
|
|
||||||
|
final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName);
|
||||||
|
final MongoCollection<Document> mdstore = client.mdStore(mdId);
|
||||||
|
long size = mdstore.count();
|
||||||
|
|
||||||
|
return StreamSupport
|
||||||
|
.stream(
|
||||||
|
Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false)
|
||||||
|
.map(doc -> doc.getString("body"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.mongodb;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
|
public class MongoDbDumpCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
|
public static final String PATH_PARAM = "path";
|
||||||
|
public static final String BODY_JSONPATH = "$.body";
|
||||||
|
|
||||||
|
public FileSystem fileSystem;
|
||||||
|
|
||||||
|
public MongoDbDumpCollectorPlugin(FileSystem fileSystem) {
|
||||||
|
this.fileSystem = fileSystem;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||||
|
|
||||||
|
final Path path = Optional
|
||||||
|
.ofNullable(api.getParams().get("path"))
|
||||||
|
.map(Path::new)
|
||||||
|
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM)));
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (!fileSystem.exists(path)) {
|
||||||
|
throw new CollectorException("path does not exist: " + path.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
return new BufferedReader(
|
||||||
|
new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset()))
|
||||||
|
.lines()
|
||||||
|
.map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s));
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new CollectorException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -13,9 +13,11 @@ import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterators;
|
import com.google.common.collect.Iterators;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
|
||||||
|
|
||||||
public class OaiCollectorPlugin implements CollectorPlugin {
|
public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
|
@ -26,8 +28,15 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
private OaiIteratorFactory oaiIteratorFactory;
|
private OaiIteratorFactory oaiIteratorFactory;
|
||||||
|
|
||||||
|
private HttpClientParams clientParams;
|
||||||
|
|
||||||
|
public OaiCollectorPlugin(HttpClientParams clientParams) {
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
|
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
|
||||||
|
throws CollectorException {
|
||||||
final String baseUrl = api.getBaseUrl();
|
final String baseUrl = api.getBaseUrl();
|
||||||
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
||||||
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
||||||
|
@ -46,26 +55,26 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (baseUrl == null || baseUrl.isEmpty()) {
|
if (baseUrl == null || baseUrl.isEmpty()) {
|
||||||
throw new DnetCollectorException("Param 'baseurl' is null or empty");
|
throw new CollectorException("Param 'baseurl' is null or empty");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mdFormat == null || mdFormat.isEmpty()) {
|
if (mdFormat == null || mdFormat.isEmpty()) {
|
||||||
throw new DnetCollectorException("Param 'mdFormat' is null or empty");
|
throw new CollectorException("Param 'mdFormat' is null or empty");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||||
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||||
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
final Iterator<Iterator<String>> iters = sets
|
final Iterator<Iterator<String>> iters = sets
|
||||||
.stream()
|
.stream()
|
||||||
.map(
|
.map(
|
||||||
set -> getOaiIteratorFactory()
|
set -> getOaiIteratorFactory()
|
||||||
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
|
||||||
.iterator();
|
.iterator();
|
||||||
|
|
||||||
return StreamSupport
|
return StreamSupport
|
||||||
|
@ -79,4 +88,12 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
}
|
}
|
||||||
return oaiIteratorFactory;
|
return oaiIteratorFactory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public HttpClientParams getClientParams() {
|
||||||
|
return clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setClientParams(HttpClientParams clientParams) {
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.oai;
|
package eu.dnetlib.dhp.collection.plugin.oai;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.io.StringWriter;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
@ -9,24 +11,28 @@ import java.util.Queue;
|
||||||
import java.util.concurrent.PriorityBlockingQueue;
|
import java.util.concurrent.PriorityBlockingQueue;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
|
import org.dom4j.DocumentHelper;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
import org.dom4j.io.OutputFormat;
|
||||||
import org.dom4j.io.SAXReader;
|
import org.dom4j.io.SAXReader;
|
||||||
|
import org.dom4j.io.XMLWriter;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
|
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
|
import eu.dnetlib.dhp.collection.XmlCleaner;
|
||||||
|
|
||||||
public class OaiIterator implements Iterator<String> {
|
public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on
|
private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
|
||||||
// 11/24/08 5:02 PM
|
|
||||||
|
private final static String REPORT_PREFIX = "oai:";
|
||||||
|
|
||||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||||
private final SAXReader reader = new SAXReader();
|
|
||||||
|
|
||||||
private final String baseUrl;
|
private final String baseUrl;
|
||||||
private final String set;
|
private final String set;
|
||||||
|
@ -35,7 +41,8 @@ public class OaiIterator implements Iterator<String> {
|
||||||
private final String untilDate;
|
private final String untilDate;
|
||||||
private String token;
|
private String token;
|
||||||
private boolean started;
|
private boolean started;
|
||||||
private final HttpConnector httpConnector;
|
private final HttpConnector2 httpConnector;
|
||||||
|
private AggregatorReport report;
|
||||||
|
|
||||||
public OaiIterator(
|
public OaiIterator(
|
||||||
final String baseUrl,
|
final String baseUrl,
|
||||||
|
@ -43,7 +50,8 @@ public class OaiIterator implements Iterator<String> {
|
||||||
final String set,
|
final String set,
|
||||||
final String fromDate,
|
final String fromDate,
|
||||||
final String untilDate,
|
final String untilDate,
|
||||||
final HttpConnector httpConnector) {
|
final HttpConnector2 httpConnector,
|
||||||
|
final AggregatorReport report) {
|
||||||
this.baseUrl = baseUrl;
|
this.baseUrl = baseUrl;
|
||||||
this.mdFormat = mdFormat;
|
this.mdFormat = mdFormat;
|
||||||
this.set = set;
|
this.set = set;
|
||||||
|
@ -51,6 +59,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
this.untilDate = untilDate;
|
this.untilDate = untilDate;
|
||||||
this.started = false;
|
this.started = false;
|
||||||
this.httpConnector = httpConnector;
|
this.httpConnector = httpConnector;
|
||||||
|
this.report = report;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyStarted() {
|
private void verifyStarted() {
|
||||||
|
@ -58,7 +67,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
this.started = true;
|
this.started = true;
|
||||||
try {
|
try {
|
||||||
this.token = firstPage();
|
this.token = firstPage();
|
||||||
} catch (final DnetCollectorException e) {
|
} catch (final CollectorException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -80,7 +89,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
while (queue.isEmpty() && token != null && !token.isEmpty()) {
|
while (queue.isEmpty() && token != null && !token.isEmpty()) {
|
||||||
try {
|
try {
|
||||||
token = otherPages(token);
|
token = otherPages(token);
|
||||||
} catch (final DnetCollectorException e) {
|
} catch (final CollectorException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -92,7 +101,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
public void remove() {
|
public void remove() {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String firstPage() throws DnetCollectorException {
|
private String firstPage() throws CollectorException {
|
||||||
try {
|
try {
|
||||||
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
|
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
|
||||||
if (set != null && !set.isEmpty()) {
|
if (set != null && !set.isEmpty()) {
|
||||||
|
@ -108,7 +117,8 @@ public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
return downloadPage(url);
|
return downloadPage(url);
|
||||||
} catch (final UnsupportedEncodingException e) {
|
} catch (final UnsupportedEncodingException e) {
|
||||||
throw new DnetCollectorException(e);
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
throw new CollectorException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,32 +136,35 @@ public class OaiIterator implements Iterator<String> {
|
||||||
return result.trim();
|
return result.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
private String otherPages(final String resumptionToken) throws DnetCollectorException {
|
private String otherPages(final String resumptionToken) throws CollectorException {
|
||||||
try {
|
try {
|
||||||
return downloadPage(
|
return downloadPage(
|
||||||
baseUrl
|
baseUrl
|
||||||
+ "?verb=ListRecords&resumptionToken="
|
+ "?verb=ListRecords&resumptionToken="
|
||||||
+ URLEncoder.encode(resumptionToken, "UTF-8"));
|
+ URLEncoder.encode(resumptionToken, "UTF-8"));
|
||||||
} catch (final UnsupportedEncodingException e) {
|
} catch (final UnsupportedEncodingException e) {
|
||||||
throw new DnetCollectorException(e);
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
throw new CollectorException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String downloadPage(final String url) throws DnetCollectorException {
|
private String downloadPage(final String url) throws CollectorException {
|
||||||
|
|
||||||
final String xml = httpConnector.getInputSource(url);
|
final String xml = httpConnector.getInputSource(url, report);
|
||||||
Document doc;
|
Document doc;
|
||||||
try {
|
try {
|
||||||
doc = reader.read(new StringReader(xml));
|
doc = DocumentHelper.parseText(xml);
|
||||||
} catch (final DocumentException e) {
|
} catch (final DocumentException e) {
|
||||||
log.warn("Error parsing xml, I try to clean it: " + xml, e);
|
log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
|
||||||
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
final String cleaned = XmlCleaner.cleanAllEntities(xml);
|
final String cleaned = XmlCleaner.cleanAllEntities(xml);
|
||||||
try {
|
try {
|
||||||
doc = reader.read(new StringReader(cleaned));
|
doc = DocumentHelper.parseText(xml);
|
||||||
} catch (final DocumentException e1) {
|
} catch (final DocumentException e1) {
|
||||||
final String resumptionToken = extractResumptionToken(xml);
|
final String resumptionToken = extractResumptionToken(xml);
|
||||||
if (resumptionToken == null) {
|
if (resumptionToken == null) {
|
||||||
throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1);
|
report.put(e1.getClass().getName(), e1.getMessage());
|
||||||
|
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
|
||||||
}
|
}
|
||||||
return resumptionToken;
|
return resumptionToken;
|
||||||
}
|
}
|
||||||
|
@ -159,19 +172,35 @@ public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
|
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
|
||||||
if (errorNode != null) {
|
if (errorNode != null) {
|
||||||
final String code = errorNode.valueOf("@code");
|
final String code = errorNode.valueOf("@code").trim();
|
||||||
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
|
if ("noRecordsMatch".equalsIgnoreCase(code)) {
|
||||||
log.warn("noRecordsMatch for oai call: " + url);
|
final String msg = "noRecordsMatch for oai call : " + url;
|
||||||
|
log.warn(msg);
|
||||||
|
report.put(REPORT_PREFIX + code, msg);
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
throw new DnetCollectorException(code + " - " + errorNode.getText());
|
final String msg = code + " - " + errorNode.getText();
|
||||||
|
report.put(REPORT_PREFIX + "error", msg);
|
||||||
|
throw new CollectorException(msg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
|
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
|
||||||
queue.add(((Node) o).asXML());
|
final StringWriter sw = new StringWriter();
|
||||||
|
final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
|
||||||
|
try {
|
||||||
|
writer.write((Node) o);
|
||||||
|
queue.add(sw.toString());
|
||||||
|
} catch (IOException e) {
|
||||||
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
|
throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return doc.valueOf("//*[local-name()='resumptionToken']");
|
return doc.valueOf("//*[local-name()='resumptionToken']");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public AggregatorReport getReport() {
|
||||||
|
return report;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,24 +3,28 @@ package eu.dnetlib.dhp.collection.plugin.oai;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
|
|
||||||
public class OaiIteratorFactory {
|
public class OaiIteratorFactory {
|
||||||
|
|
||||||
private HttpConnector httpConnector;
|
private HttpConnector2 httpConnector;
|
||||||
|
|
||||||
public Iterator<String> newIterator(
|
public Iterator<String> newIterator(
|
||||||
final String baseUrl,
|
final String baseUrl,
|
||||||
final String mdFormat,
|
final String mdFormat,
|
||||||
final String set,
|
final String set,
|
||||||
final String fromDate,
|
final String fromDate,
|
||||||
final String untilDate) {
|
final String untilDate,
|
||||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector());
|
final HttpClientParams clientParams,
|
||||||
|
final AggregatorReport report) {
|
||||||
|
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report);
|
||||||
}
|
}
|
||||||
|
|
||||||
private HttpConnector getHttpConnector() {
|
private HttpConnector2 getHttpConnector(HttpClientParams clientParams) {
|
||||||
if (httpConnector == null)
|
if (httpConnector == null)
|
||||||
httpConnector = new HttpConnector();
|
httpConnector = new HttpConnector2(clientParams);
|
||||||
return httpConnector;
|
return httpConnector;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Spliterator;
|
||||||
|
import java.util.Spliterators;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* TODO: delegate HTTP requests to the common HttpConnector2 implementation.
|
||||||
|
*
|
||||||
|
* @author js, Andreas Czerniak
|
||||||
|
* @date 2020-04-09
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class RestCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
|
public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
|
||||||
|
|
||||||
|
private HttpClientParams clientParams;
|
||||||
|
|
||||||
|
public RestCollectorPlugin(HttpClientParams clientParams) {
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
||||||
|
final String baseUrl = api.getBaseUrl();
|
||||||
|
|
||||||
|
final String resumptionType = api.getParams().get("resumptionType");
|
||||||
|
final String resumptionParam = api.getParams().get("resumptionParam");
|
||||||
|
final String resumptionXpath = api.getParams().get("resumptionXpath");
|
||||||
|
final String resultTotalXpath = api.getParams().get("resultTotalXpath");
|
||||||
|
final String resultFormatParam = api.getParams().get("resultFormatParam");
|
||||||
|
final String resultFormatValue = api.getParams().get("resultFormatValue");
|
||||||
|
final String resultSizeParam = api.getParams().get("resultSizeParam");
|
||||||
|
final String queryParams = api.getParams().get("queryParams");
|
||||||
|
final String entityXpath = api.getParams().get("entityXpath");
|
||||||
|
final String authMethod = api.getParams().get("authMethod");
|
||||||
|
final String authToken = api.getParams().get("authToken");
|
||||||
|
final String resultSizeValue = Optional
|
||||||
|
.ofNullable(api.getParams().get("resultSizeValue"))
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.orElse(RESULT_SIZE_VALUE_DEFAULT);
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(baseUrl)) {
|
||||||
|
throw new CollectorException("Param 'baseUrl' is null or empty");
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(resumptionType)) {
|
||||||
|
throw new CollectorException("Param 'resumptionType' is null or empty");
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(resumptionParam)) {
|
||||||
|
throw new CollectorException("Param 'resumptionParam' is null or empty");
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(resultFormatValue)) {
|
||||||
|
throw new CollectorException("Param 'resultFormatValue' is null or empty");
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(queryParams)) {
|
||||||
|
throw new CollectorException("Param 'queryParams' is null or empty");
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(entityXpath)) {
|
||||||
|
throw new CollectorException("Param 'entityXpath' is null or empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
final String resultOutputFormat = Optional
|
||||||
|
.ofNullable(api.getParams().get("resultOutputFormat"))
|
||||||
|
.map(String::toLowerCase)
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.orElse(resultFormatValue.toLowerCase());
|
||||||
|
|
||||||
|
RestIterator it = new RestIterator(
|
||||||
|
getClientParams(),
|
||||||
|
baseUrl,
|
||||||
|
resumptionType,
|
||||||
|
resumptionParam,
|
||||||
|
resumptionXpath,
|
||||||
|
resultTotalXpath,
|
||||||
|
resultFormatParam,
|
||||||
|
resultFormatValue,
|
||||||
|
resultSizeParam,
|
||||||
|
resultSizeValue,
|
||||||
|
queryParams,
|
||||||
|
entityXpath,
|
||||||
|
authMethod,
|
||||||
|
authToken,
|
||||||
|
resultOutputFormat);
|
||||||
|
|
||||||
|
return StreamSupport
|
||||||
|
.stream(
|
||||||
|
Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpClientParams getClientParams() {
|
||||||
|
return clientParams;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,412 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLEncoder;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.PriorityBlockingQueue;
|
||||||
|
|
||||||
|
import javax.xml.transform.OutputKeys;
|
||||||
|
import javax.xml.transform.Transformer;
|
||||||
|
import javax.xml.transform.TransformerConfigurationException;
|
||||||
|
import javax.xml.transform.TransformerFactory;
|
||||||
|
import javax.xml.transform.dom.DOMSource;
|
||||||
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
import javax.xml.xpath.*;
|
||||||
|
|
||||||
|
import org.apache.avro.test.http.Http;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.http.HttpHeaders;
|
||||||
|
import org.apache.http.entity.ContentType;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
import org.xml.sax.InputSource;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||||
|
import eu.dnetlib.dhp.collection.JsonUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* log.info(...) equal to log.trace(...) in the application-logs
|
||||||
|
* <p>
|
||||||
|
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||||
|
*
|
||||||
|
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
|
||||||
|
* @date 2020-04-09
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class RestIterator implements Iterator<String> {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||||
|
public static final String UTF_8 = "UTF-8";
|
||||||
|
|
||||||
|
private HttpClientParams clientParams;
|
||||||
|
|
||||||
|
private final String BASIC = "basic";
|
||||||
|
|
||||||
|
private JsonUtils jsonUtils;
|
||||||
|
|
||||||
|
private String baseUrl;
|
||||||
|
private String resumptionType;
|
||||||
|
private String resumptionParam;
|
||||||
|
private String resultFormatValue;
|
||||||
|
private String queryParams;
|
||||||
|
private int resultSizeValue;
|
||||||
|
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||||
|
private int resultTotal = -1;
|
||||||
|
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
||||||
|
// or token scanned from results)
|
||||||
|
private InputStream resultStream;
|
||||||
|
private Transformer transformer;
|
||||||
|
private XPath xpath;
|
||||||
|
private String query;
|
||||||
|
private XPathExpression xprResultTotalPath;
|
||||||
|
private XPathExpression xprResumptionPath;
|
||||||
|
private XPathExpression xprEntity;
|
||||||
|
private String queryFormat;
|
||||||
|
private String querySize;
|
||||||
|
private String authMethod;
|
||||||
|
private String authToken;
|
||||||
|
private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||||
|
private int discoverResultSize = 0;
|
||||||
|
private int pagination = 1;
|
||||||
|
/*
|
||||||
|
* While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in
|
||||||
|
* json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
|
||||||
|
* json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
|
||||||
|
*/
|
||||||
|
private String resultOutputFormat;
|
||||||
|
|
||||||
|
/** RestIterator class
|
||||||
|
* compatible to version 1.3.33
|
||||||
|
*/
|
||||||
|
public RestIterator(
|
||||||
|
final HttpClientParams clientParams,
|
||||||
|
final String baseUrl,
|
||||||
|
final String resumptionType,
|
||||||
|
final String resumptionParam,
|
||||||
|
final String resumptionXpath,
|
||||||
|
final String resultTotalXpath,
|
||||||
|
final String resultFormatParam,
|
||||||
|
final String resultFormatValue,
|
||||||
|
final String resultSizeParam,
|
||||||
|
final String resultSizeValueStr,
|
||||||
|
final String queryParams,
|
||||||
|
final String entityXpath,
|
||||||
|
final String authMethod,
|
||||||
|
final String authToken,
|
||||||
|
final String resultOutputFormat) {
|
||||||
|
|
||||||
|
this.clientParams = clientParams;
|
||||||
|
this.jsonUtils = new JsonUtils();
|
||||||
|
this.baseUrl = baseUrl;
|
||||||
|
this.resumptionType = resumptionType;
|
||||||
|
this.resumptionParam = resumptionParam;
|
||||||
|
this.resultFormatValue = resultFormatValue;
|
||||||
|
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||||
|
this.queryParams = queryParams;
|
||||||
|
this.authMethod = authMethod;
|
||||||
|
this.authToken = authToken;
|
||||||
|
this.resultOutputFormat = resultOutputFormat;
|
||||||
|
|
||||||
|
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||||
|
: "";
|
||||||
|
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||||
|
|
||||||
|
try {
|
||||||
|
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||||
|
}
|
||||||
|
initQueue();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
||||||
|
throws TransformerConfigurationException, XPathExpressionException {
|
||||||
|
transformer = TransformerFactory.newInstance().newTransformer();
|
||||||
|
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||||
|
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||||
|
xpath = XPathFactory.newInstance().newXPath();
|
||||||
|
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
||||||
|
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||||
|
xprEntity = xpath.compile(entityXpath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initQueue() {
|
||||||
|
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||||
|
log.info("REST calls starting with " + query);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void disconnect() {
|
||||||
|
// TODO close inputstream
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
* @see java.util.Iterator#hasNext()
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||||
|
disconnect();
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
* @see java.util.Iterator#next()
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String next() {
|
||||||
|
synchronized (recordQueue) {
|
||||||
|
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||||
|
try {
|
||||||
|
query = downloadPage(query);
|
||||||
|
} catch (CollectorException e) {
|
||||||
|
log.debug("CollectorPlugin.next()-Exception: " + e);
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return recordQueue.poll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* download page and return nextQuery
|
||||||
|
*/
|
||||||
|
private String downloadPage(String query) throws CollectorException {
|
||||||
|
String resultJson;
|
||||||
|
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||||
|
String nextQuery = "";
|
||||||
|
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
|
||||||
|
Node resultNode = null;
|
||||||
|
NodeList nodeList = null;
|
||||||
|
String qUrlArgument = "";
|
||||||
|
int urlOldResumptionSize = 0;
|
||||||
|
InputStream theHttpInputStream;
|
||||||
|
|
||||||
|
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||||
|
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
||||||
|
log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
|
||||||
|
if (!query.contains("&cursor=")) {
|
||||||
|
query += "&cursor=*";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
log.info("requestig URL [{}]", query);
|
||||||
|
|
||||||
|
URL qUrl = new URL(query);
|
||||||
|
log.debug("authMethod :" + authMethod);
|
||||||
|
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||||
|
log.trace("authMethod before inputStream: " + resultXml);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
theHttpInputStream = conn.getInputStream();
|
||||||
|
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||||
|
log.trace("authMethod before inputStream: " + resultXml);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
||||||
|
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
theHttpInputStream = conn.getInputStream();
|
||||||
|
} else {
|
||||||
|
theHttpInputStream = qUrl.openStream();
|
||||||
|
}
|
||||||
|
|
||||||
|
resultStream = theHttpInputStream;
|
||||||
|
if ("json".equals(resultOutputFormat)) {
|
||||||
|
resultJson = IOUtils.toString(resultStream, UTF_8);
|
||||||
|
resultXml = jsonUtils.convertToXML(resultJson);
|
||||||
|
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||||
|
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||||
|
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||||
|
log.debug("nodeList.length: " + nodeList.getLength());
|
||||||
|
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||||
|
StringWriter sw = new StringWriter();
|
||||||
|
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||||
|
String toEnqueue = sw.toString();
|
||||||
|
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||||
|
log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
|
||||||
|
} else {
|
||||||
|
recordQueue.add(sw.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.warn("resultXml is equal with emptyXml");
|
||||||
|
}
|
||||||
|
|
||||||
|
resumptionInt += resultSizeValue;
|
||||||
|
|
||||||
|
switch (resumptionType.toLowerCase()) {
|
||||||
|
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||||
|
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "count": // begin at one step for all records, iterate over items
|
||||||
|
resumptionStr = Integer.toString(resumptionInt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||||
|
if (resultSizeValue < 2) {
|
||||||
|
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||||
|
}
|
||||||
|
qUrlArgument = qUrl.getQuery();
|
||||||
|
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||||
|
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||||
|
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||||
|
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||||
|
if (isInteger(resumptionKeyValue[1])) {
|
||||||
|
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||||
|
log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
|
||||||
|
} else {
|
||||||
|
log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||||
|
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
||||||
|
// resumptionStr = "";
|
||||||
|
if (nodeList != null) {
|
||||||
|
discoverResultSize += nodeList.getLength();
|
||||||
|
}
|
||||||
|
resultTotal = discoverResultSize;
|
||||||
|
} else {
|
||||||
|
resumptionStr = Integer.toString(resumptionInt);
|
||||||
|
resultTotal = resumptionInt + 1;
|
||||||
|
if (nodeList != null) {
|
||||||
|
discoverResultSize += nodeList.getLength();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("discoverResultSize: {}", discoverResultSize);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "pagination":
|
||||||
|
case "page": // pagination, iterate over page numbers
|
||||||
|
pagination += 1;
|
||||||
|
if (nodeList != null) {
|
||||||
|
discoverResultSize += nodeList.getLength();
|
||||||
|
} else {
|
||||||
|
resultTotal = discoverResultSize;
|
||||||
|
pagination = discoverResultSize;
|
||||||
|
}
|
||||||
|
resumptionInt = pagination;
|
||||||
|
resumptionStr = Integer.toString(resumptionInt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
||||||
|
// solr)
|
||||||
|
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||||
|
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||||
|
|
||||||
|
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
||||||
|
queryParams = queryParams.replace("&cursor=*", "");
|
||||||
|
|
||||||
|
// terminating if length of nodeList is 0
|
||||||
|
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
||||||
|
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
||||||
|
} else {
|
||||||
|
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
||||||
|
// because the iteration is over
|
||||||
|
// real length and the
|
||||||
|
// resultSizeValue is added before
|
||||||
|
// the switch()
|
||||||
|
}
|
||||||
|
|
||||||
|
discoverResultSize = nodeList.getLength();
|
||||||
|
|
||||||
|
log
|
||||||
|
.debug(
|
||||||
|
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
||||||
|
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
||||||
|
|
||||||
|
break;
|
||||||
|
|
||||||
|
default: // otherwise: abort
|
||||||
|
// resultTotal = resumptionInt;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (resultTotal == -1) {
|
||||||
|
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||||
|
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
||||||
|
resultTotal += 1;
|
||||||
|
} // to correct the upper bound
|
||||||
|
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||||
|
}
|
||||||
|
log.debug("resultTotal: " + resultTotal);
|
||||||
|
log.debug("resInt: " + resumptionInt);
|
||||||
|
if (resumptionInt <= resultTotal) {
|
||||||
|
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
||||||
|
+ queryFormat;
|
||||||
|
} else {
|
||||||
|
nextQuery = "";
|
||||||
|
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||||
|
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||||
|
}
|
||||||
|
log.debug("nextQueryUrl: " + nextQuery);
|
||||||
|
return nextQuery;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isInteger(String s) {
|
||||||
|
boolean isValidInteger = false;
|
||||||
|
try {
|
||||||
|
Integer.parseInt(s);
|
||||||
|
|
||||||
|
// s is a valid integer
|
||||||
|
|
||||||
|
isValidInteger = true;
|
||||||
|
} catch (NumberFormatException ex) {
|
||||||
|
// s is not an integer
|
||||||
|
}
|
||||||
|
|
||||||
|
return isValidInteger;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Method to encode a string value using `UTF-8` encoding scheme
|
||||||
|
private String encodeValue(String value) {
|
||||||
|
try {
|
||||||
|
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||||
|
} catch (UnsupportedEncodingException ex) {
|
||||||
|
throw new RuntimeException(ex.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getResultFormatValue() {
|
||||||
|
return resultFormatValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getResultOutputFormat() {
|
||||||
|
return resultOutputFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,139 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.URI;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hadoop.io.IntWritable;
|
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
|
||||||
import eu.dnetlib.message.Message;
|
|
||||||
import eu.dnetlib.message.MessageManager;
|
|
||||||
import eu.dnetlib.message.MessageType;
|
|
||||||
|
|
||||||
public class DnetCollectorWorker {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);
|
|
||||||
|
|
||||||
private final CollectorPluginFactory collectorPluginFactory;
|
|
||||||
|
|
||||||
private final ArgumentApplicationParser argumentParser;
|
|
||||||
|
|
||||||
private final MessageManager manager;
|
|
||||||
|
|
||||||
public DnetCollectorWorker(
|
|
||||||
final CollectorPluginFactory collectorPluginFactory,
|
|
||||||
final ArgumentApplicationParser argumentParser,
|
|
||||||
final MessageManager manager)
|
|
||||||
throws DnetCollectorException {
|
|
||||||
this.collectorPluginFactory = collectorPluginFactory;
|
|
||||||
this.argumentParser = argumentParser;
|
|
||||||
this.manager = manager;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void collect() throws DnetCollectorException {
|
|
||||||
try {
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
|
||||||
final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);
|
|
||||||
|
|
||||||
final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
|
|
||||||
|
|
||||||
final String hdfsuri = argumentParser.get("namenode");
|
|
||||||
|
|
||||||
// ====== Init HDFS File System Object
|
|
||||||
Configuration conf = new Configuration();
|
|
||||||
// Set FileSystem URI
|
|
||||||
conf.set("fs.defaultFS", hdfsuri);
|
|
||||||
// Because of Maven
|
|
||||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
|
||||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
|
||||||
|
|
||||||
System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
|
|
||||||
System.setProperty("hadoop.home.dir", "/");
|
|
||||||
// Get the filesystem - HDFS
|
|
||||||
FileSystem.get(URI.create(hdfsuri), conf);
|
|
||||||
Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));
|
|
||||||
|
|
||||||
log.info("Created path " + hdfswritepath.toString());
|
|
||||||
|
|
||||||
final Map<String, String> ongoingMap = new HashMap<>();
|
|
||||||
final Map<String, String> reportMap = new HashMap<>();
|
|
||||||
final AtomicInteger counter = new AtomicInteger(0);
|
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
|
||||||
.createWriter(
|
|
||||||
conf,
|
|
||||||
SequenceFile.Writer.file(hdfswritepath),
|
|
||||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
|
||||||
SequenceFile.Writer.valueClass(Text.class))) {
|
|
||||||
final IntWritable key = new IntWritable(counter.get());
|
|
||||||
final Text value = new Text();
|
|
||||||
plugin
|
|
||||||
.collect(api)
|
|
||||||
.forEach(
|
|
||||||
content -> {
|
|
||||||
key.set(counter.getAndIncrement());
|
|
||||||
value.set(content);
|
|
||||||
if (counter.get() % 10 == 0) {
|
|
||||||
try {
|
|
||||||
ongoingMap.put("ongoing", "" + counter.get());
|
|
||||||
log
|
|
||||||
.debug(
|
|
||||||
"Sending message: "
|
|
||||||
+ manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(
|
|
||||||
argumentParser.get("workflowId"),
|
|
||||||
"Collection",
|
|
||||||
MessageType.ONGOING,
|
|
||||||
ongoingMap),
|
|
||||||
argumentParser.get("rabbitOngoingQueue"),
|
|
||||||
true,
|
|
||||||
false));
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Error on sending message ", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
writer.append(key, value);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
ongoingMap.put("ongoing", "" + counter.get());
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(
|
|
||||||
argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
|
|
||||||
argumentParser.get("rabbitOngoingQueue"),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
reportMap.put("collected", "" + counter.get());
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(
|
|
||||||
argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
|
|
||||||
argumentParser.get("rabbitOngoingQueue"),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
manager.close();
|
|
||||||
} catch (Throwable e) {
|
|
||||||
throw new DnetCollectorException("Error on collecting ", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,49 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
|
||||||
import eu.dnetlib.message.MessageManager;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
|
|
||||||
* will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
|
|
||||||
* plugin to use and where store the data into HDFS path
|
|
||||||
*
|
|
||||||
* @author Sandro La Bruzzo
|
|
||||||
*/
|
|
||||||
public class DnetCollectorWorkerApplication {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
|
|
||||||
|
|
||||||
private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
|
|
||||||
|
|
||||||
private static ArgumentApplicationParser argumentParser;
|
|
||||||
|
|
||||||
/** @param args */
|
|
||||||
public static void main(final String[] args) throws Exception {
|
|
||||||
|
|
||||||
argumentParser = new ArgumentApplicationParser(
|
|
||||||
IOUtils
|
|
||||||
.toString(
|
|
||||||
DnetCollectorWorker.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/collector/worker/collector_parameter.json")));
|
|
||||||
argumentParser.parseArgument(args);
|
|
||||||
log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
|
|
||||||
log.info("json = " + argumentParser.get("apidescriptor"));
|
|
||||||
final MessageManager manager = new MessageManager(
|
|
||||||
argumentParser.get("rabbitHost"),
|
|
||||||
argumentParser.get("rabbitUser"),
|
|
||||||
argumentParser.get("rabbitPassword"),
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
null);
|
|
||||||
final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
|
|
||||||
worker.collect();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,19 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker.utils;
|
|
||||||
|
|
||||||
import java.util.LinkedList;
|
|
||||||
|
|
||||||
public class CollectorPluginErrorLogList extends LinkedList<String> {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = -6925786561303289704L;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
String log = "";
|
|
||||||
int index = 0;
|
|
||||||
for (final String errorMessage : this) {
|
|
||||||
log += String.format("Retry #%s: %s / ", index++, errorMessage);
|
|
||||||
}
|
|
||||||
return log;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker.utils;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
|
||||||
|
|
||||||
public class CollectorPluginFactory {
|
|
||||||
|
|
||||||
public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException {
|
|
||||||
if (protocol == null)
|
|
||||||
throw new DnetCollectorException("protocol cannot be null");
|
|
||||||
switch (protocol.toLowerCase().trim()) {
|
|
||||||
case "oai":
|
|
||||||
return new OaiCollectorPlugin();
|
|
||||||
default:
|
|
||||||
throw new DnetCollectorException("UNknown protocol");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,244 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.worker.utils;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.net.*;
|
|
||||||
import java.security.GeneralSecurityException;
|
|
||||||
import java.security.cert.X509Certificate;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import javax.net.ssl.HttpsURLConnection;
|
|
||||||
import javax.net.ssl.SSLContext;
|
|
||||||
import javax.net.ssl.TrustManager;
|
|
||||||
import javax.net.ssl.X509TrustManager;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang.math.NumberUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
|
||||||
|
|
||||||
public class HttpConnector {
|
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
|
||||||
|
|
||||||
private int maxNumberOfRetry = 6;
|
|
||||||
private int defaultDelay = 120; // seconds
|
|
||||||
private int readTimeOut = 120; // seconds
|
|
||||||
|
|
||||||
private String responseType = null;
|
|
||||||
|
|
||||||
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
|
||||||
|
|
||||||
public HttpConnector() {
|
|
||||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given the URL returns the content via HTTP GET
|
|
||||||
*
|
|
||||||
* @param requestUrl the URL
|
|
||||||
* @return the content of the downloaded resource
|
|
||||||
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
|
|
||||||
*/
|
|
||||||
public String getInputSource(final String requestUrl) throws DnetCollectorException {
|
|
||||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given the URL returns the content as a stream via HTTP GET
|
|
||||||
*
|
|
||||||
* @param requestUrl the URL
|
|
||||||
* @return the content of the downloaded resource as InputStream
|
|
||||||
* @throws DnetCollectorException when retrying more than maxNumberOfRetry times
|
|
||||||
*/
|
|
||||||
public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
|
|
||||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private String attemptDownlaodAsString(
|
|
||||||
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
|
||||||
throws DnetCollectorException {
|
|
||||||
try {
|
|
||||||
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
|
||||||
try {
|
|
||||||
return IOUtils.toString(s);
|
|
||||||
} catch (final IOException e) {
|
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList.add(e.getMessage());
|
|
||||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} finally {
|
|
||||||
IOUtils.closeQuietly(s);
|
|
||||||
}
|
|
||||||
} catch (final InterruptedException e) {
|
|
||||||
throw new DnetCollectorException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private InputStream attemptDownload(
|
|
||||||
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
|
||||||
throws DnetCollectorException {
|
|
||||||
|
|
||||||
if (retryNumber > maxNumberOfRetry) {
|
|
||||||
throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
|
||||||
try {
|
|
||||||
InputStream input = null;
|
|
||||||
|
|
||||||
try {
|
|
||||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
|
||||||
urlConn.setInstanceFollowRedirects(false);
|
|
||||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
|
||||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
|
||||||
|
|
||||||
if (log.isDebugEnabled()) {
|
|
||||||
logHeaderFields(urlConn);
|
|
||||||
}
|
|
||||||
|
|
||||||
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
|
||||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
|
||||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
|
||||||
Thread.sleep(retryAfter * 1000);
|
|
||||||
errorList.add("503 Service Unavailable");
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
|
|
||||||
|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
|
|
||||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
|
||||||
log.debug("The requested url has been moved to " + newUrl);
|
|
||||||
errorList
|
|
||||||
.add(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"%s %s. Moved to: %s",
|
|
||||||
urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
|
||||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
|
||||||
log
|
|
||||||
.error(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList
|
|
||||||
.add(
|
|
||||||
String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
urlConn.disconnect();
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
} else {
|
|
||||||
input = urlConn.getInputStream();
|
|
||||||
responseType = urlConn.getContentType();
|
|
||||||
return input;
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
|
||||||
errorList.add(e.getMessage());
|
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
|
||||||
}
|
|
||||||
} catch (final InterruptedException e) {
|
|
||||||
throw new DnetCollectorException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
|
||||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
|
||||||
|
|
||||||
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
|
||||||
if (e.getKey() != null) {
|
|
||||||
for (final String v : e.getValue()) {
|
|
||||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
|
||||||
for (final String key : headerMap.keySet()) {
|
|
||||||
if (key != null
|
|
||||||
&& key.toLowerCase().equals("retry-after")
|
|
||||||
&& headerMap.get(key).size() > 0
|
|
||||||
&& NumberUtils.isNumber(headerMap.get(key).get(0))) {
|
|
||||||
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String obtainNewLocation(final Map<String, List<String>> headerMap)
|
|
||||||
throws DnetCollectorException {
|
|
||||||
for (final String key : headerMap.keySet()) {
|
|
||||||
if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) {
|
|
||||||
return headerMap.get(key).get(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw new DnetCollectorException(
|
|
||||||
"The requested url has been MOVED, but 'location' param is MISSING");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
|
||||||
*/
|
|
||||||
public void initTrustManager() {
|
|
||||||
final X509TrustManager tm = new X509TrustManager() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public X509Certificate[] getAcceptedIssuers() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
try {
|
|
||||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
|
||||||
ctx.init(null, new TrustManager[] {
|
|
||||||
tm
|
|
||||||
}, null);
|
|
||||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
|
||||||
} catch (final GeneralSecurityException e) {
|
|
||||||
log.fatal(e);
|
|
||||||
throw new IllegalStateException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getMaxNumberOfRetry() {
|
|
||||||
return maxNumberOfRetry;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
|
||||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getDefaultDelay() {
|
|
||||||
return defaultDelay;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setDefaultDelay(final int defaultDelay) {
|
|
||||||
this.defaultDelay = defaultDelay;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getReadTimeOut() {
|
|
||||||
return readTimeOut;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReadTimeOut(final int readTimeOut) {
|
|
||||||
this.readTimeOut = readTimeOut;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getResponseType() {
|
|
||||||
return responseType;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.transformation;
|
||||||
|
|
||||||
|
public class DnetTransformationException extends Exception {
|
||||||
|
|
||||||
|
public DnetTransformationException() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public DnetTransformationException(
|
||||||
|
final String message,
|
||||||
|
final Throwable cause,
|
||||||
|
final boolean enableSuppression,
|
||||||
|
final boolean writableStackTrace) {
|
||||||
|
super(message, cause, enableSuppression, writableStackTrace);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DnetTransformationException(final String message, final Throwable cause) {
|
||||||
|
super(message, cause);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DnetTransformationException(final String message) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DnetTransformationException(final Throwable cause) {
|
||||||
|
super(cause);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,15 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation;
|
package eu.dnetlib.dhp.transformation;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.Constants.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.*;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.cli.*;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -17,22 +16,18 @@ import org.apache.spark.sql.Encoder;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.dom4j.Document;
|
|
||||||
import org.dom4j.DocumentException;
|
|
||||||
import org.dom4j.Node;
|
|
||||||
import org.dom4j.io.SAXReader;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
import eu.dnetlib.dhp.message.MessageSender;
|
||||||
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
||||||
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.message.Message;
|
|
||||||
import eu.dnetlib.message.MessageManager;
|
|
||||||
import eu.dnetlib.message.MessageType;
|
|
||||||
|
|
||||||
public class TransformSparkJobNode {
|
public class TransformSparkJobNode {
|
||||||
|
|
||||||
|
@ -55,67 +50,85 @@ public class TransformSparkJobNode {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String inputPath = parser.get("input");
|
final String mdstoreInputVersion = parser.get("mdstoreInputVersion");
|
||||||
final String outputPath = parser.get("output");
|
final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion");
|
||||||
final String workflowId = parser.get("workflowId");
|
|
||||||
final String trasformationRule = extractXSLTFromTR(
|
|
||||||
Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule"))));
|
|
||||||
|
|
||||||
final String rabbitUser = parser.get("rabbitUser");
|
final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
|
||||||
final String rabbitPassword = parser.get("rabbitPassword");
|
final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
|
||||||
final String rabbitHost = parser.get("rabbitHost");
|
log.info("inputPath: {}", inputPath);
|
||||||
final String rabbitReportQueue = parser.get("rabbitReportQueue");
|
|
||||||
final long dateOfCollection = new Long(parser.get("dateOfCollection"));
|
final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
|
||||||
final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
|
final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
|
||||||
|
log.info("outputBasePath: {}", outputBasePath);
|
||||||
|
|
||||||
|
final String isLookupUrl = parser.get("isLookupUrl");
|
||||||
|
log.info(String.format("isLookupUrl: %s", isLookupUrl));
|
||||||
|
|
||||||
|
final String dateOfTransformation = parser.get("dateOfTransformation");
|
||||||
|
log.info(String.format("dateOfTransformation: %s", dateOfTransformation));
|
||||||
|
|
||||||
|
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||||
|
|
||||||
|
final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||||
|
|
||||||
|
log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size());
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
transformRecords(
|
||||||
final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
|
parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath);
|
||||||
final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
|
|
||||||
final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
|
|
||||||
final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
|
|
||||||
final Map<String, Vocabulary> vocabularies = new HashMap<>();
|
|
||||||
vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
|
|
||||||
final TransformFunction transformFunction = new TransformFunction(
|
|
||||||
totalItems,
|
|
||||||
errorItems,
|
|
||||||
transformedItems,
|
|
||||||
trasformationRule,
|
|
||||||
dateOfCollection,
|
|
||||||
vocabularies);
|
|
||||||
mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath);
|
|
||||||
if (rabbitHost != null) {
|
|
||||||
System.out.println("SEND FINAL REPORT");
|
|
||||||
final Map<String, String> reportMap = new HashMap<>();
|
|
||||||
reportMap.put("inputItem", "" + totalItems.value());
|
|
||||||
reportMap.put("invalidRecords", "" + errorItems.value());
|
|
||||||
reportMap.put("mdStoreSize", "" + transformedItems.value());
|
|
||||||
System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap));
|
|
||||||
if (!test) {
|
|
||||||
final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false,
|
|
||||||
false,
|
|
||||||
null);
|
|
||||||
manager
|
|
||||||
.sendMessage(
|
|
||||||
new Message(workflowId, "Transform", MessageType.REPORT, reportMap),
|
|
||||||
rabbitReportQueue,
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
manager.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String extractXSLTFromTR(final String tr) throws DocumentException {
|
public static void transformRecords(final Map<String, String> args, final ISLookUpService isLookUpService,
|
||||||
SAXReader reader = new SAXReader();
|
final SparkSession spark, final String inputPath, final String outputBasePath)
|
||||||
Document document = reader.read(new ByteArrayInputStream(tr.getBytes()));
|
throws DnetTransformationException, IOException {
|
||||||
Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
|
|
||||||
return node.asXML();
|
final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS);
|
||||||
|
final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS);
|
||||||
|
final LongAccumulator transformedItems = spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS);
|
||||||
|
final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
|
||||||
|
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
||||||
|
|
||||||
|
final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL);
|
||||||
|
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
|
||||||
|
|
||||||
|
final String workflowId = args.get("workflowId");
|
||||||
|
log.info("workflowId is {}", workflowId);
|
||||||
|
|
||||||
|
final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId);
|
||||||
|
try (AggregatorReport report = new AggregatorReport(messageSender)) {
|
||||||
|
try {
|
||||||
|
final Dataset<MetadataRecord> mdstore = spark
|
||||||
|
.read()
|
||||||
|
.format("parquet")
|
||||||
|
.load(inputPath)
|
||||||
|
.as(encoder)
|
||||||
|
.map(
|
||||||
|
TransformationFactory.getTransformationPlugin(args, ct, isLookUpService),
|
||||||
|
encoder);
|
||||||
|
saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH);
|
||||||
|
|
||||||
|
log.info("Transformed item " + ct.getProcessedItems().count());
|
||||||
|
log.info("Total item " + ct.getTotalItems().count());
|
||||||
|
log.info("Transformation Error item " + ct.getErrorItems().count());
|
||||||
|
|
||||||
|
final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count();
|
||||||
|
writeHdfsFile(
|
||||||
|
spark.sparkContext().hadoopConfiguration(),
|
||||||
|
"" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
log.error("error during record transformation", e);
|
||||||
|
report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage());
|
||||||
|
report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString());
|
||||||
|
report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString());
|
||||||
|
report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString());
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.transformation;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
||||||
|
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
public class TransformationFactory {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
|
||||||
|
public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]";
|
||||||
|
|
||||||
|
public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
|
||||||
|
final Map<String, String> jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService)
|
||||||
|
throws DnetTransformationException {
|
||||||
|
|
||||||
|
try {
|
||||||
|
final String transformationPlugin = jobArgument.get("transformationPlugin");
|
||||||
|
|
||||||
|
log.info("Transformation plugin required " + transformationPlugin);
|
||||||
|
switch (transformationPlugin) {
|
||||||
|
case "XSLT_TRANSFORM": {
|
||||||
|
final String transformationRuleId = jobArgument.get("transformationRuleId");
|
||||||
|
if (StringUtils.isBlank(transformationRuleId))
|
||||||
|
throw new DnetTransformationException("Missing Parameter transformationRule");
|
||||||
|
final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||||
|
|
||||||
|
final String transformationRule = queryTransformationRuleFromIS(
|
||||||
|
transformationRuleId, isLookupService);
|
||||||
|
|
||||||
|
final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation"));
|
||||||
|
return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation,
|
||||||
|
vocabularies);
|
||||||
|
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
throw new DnetTransformationException(
|
||||||
|
"transformation plugin does not exists for " + transformationPlugin);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new DnetTransformationException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String queryTransformationRuleFromIS(final String transformationRuleId,
|
||||||
|
final ISLookUpService isLookUpService) throws Exception {
|
||||||
|
final String query = String.format(TRULE_XQUERY, transformationRuleId);
|
||||||
|
System.out.println("asking query to IS: " + query);
|
||||||
|
List<String> result = isLookUpService.quickSearchProfile(query);
|
||||||
|
|
||||||
|
if (result == null || result.isEmpty())
|
||||||
|
throw new DnetTransformationException(
|
||||||
|
"Unable to find transformation rule with name: " + transformationRuleId);
|
||||||
|
return result.get(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,53 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation.vocabulary;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
public class Term implements Serializable {
|
|
||||||
|
|
||||||
private String englishName;
|
|
||||||
private String nativeName;
|
|
||||||
private String encoding;
|
|
||||||
private String code;
|
|
||||||
private String synonyms;
|
|
||||||
|
|
||||||
public String getEnglishName() {
|
|
||||||
return englishName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setEnglishName(String englishName) {
|
|
||||||
this.englishName = englishName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getNativeName() {
|
|
||||||
return nativeName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setNativeName(String nativeName) {
|
|
||||||
this.nativeName = nativeName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getEncoding() {
|
|
||||||
return encoding;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setEncoding(String encoding) {
|
|
||||||
this.encoding = encoding;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getCode() {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCode(String code) {
|
|
||||||
this.code = code;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getSynonyms() {
|
|
||||||
return synonyms;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setSynonyms(String synonyms) {
|
|
||||||
this.synonyms = synonyms;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,54 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation.vocabulary;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class Vocabulary implements Serializable {
|
|
||||||
|
|
||||||
private String id;
|
|
||||||
private String name;
|
|
||||||
private String description;
|
|
||||||
private String code;
|
|
||||||
private List<Term> terms;
|
|
||||||
|
|
||||||
public String getId() {
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setId(String id) {
|
|
||||||
this.id = id;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getName() {
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setName(String name) {
|
|
||||||
this.name = name;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getDescription() {
|
|
||||||
return description;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setDescription(String description) {
|
|
||||||
this.description = description;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getCode() {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCode(String code) {
|
|
||||||
this.code = code;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Term> getTerms() {
|
|
||||||
return terms;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setTerms(List<Term> terms) {
|
|
||||||
this.terms = terms;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,24 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation.vocabulary;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
public class VocabularyHelper implements Serializable {
|
|
||||||
|
|
||||||
private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json";
|
|
||||||
|
|
||||||
public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception {
|
|
||||||
final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName));
|
|
||||||
|
|
||||||
final String response = IOUtils.toString(url, Charset.defaultCharset());
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
|
||||||
final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class);
|
|
||||||
return vocabulary;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,25 +1,25 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation.functions;
|
package eu.dnetlib.dhp.transformation.xslt;
|
||||||
|
|
||||||
import java.util.Map;
|
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.transformation.vocabulary.Term;
|
import java.io.Serializable;
|
||||||
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import net.sf.saxon.s9api.*;
|
import net.sf.saxon.s9api.*;
|
||||||
import scala.Serializable;
|
|
||||||
|
|
||||||
public class Cleaner implements ExtensionFunction, Serializable {
|
public class Cleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
private final Map<String, Vocabulary> vocabularies;
|
private final VocabularyGroup vocabularies;
|
||||||
|
|
||||||
public Cleaner(Map<String, Vocabulary> vocabularies) {
|
public Cleaner(final VocabularyGroup vocabularies) {
|
||||||
this.vocabularies = vocabularies;
|
this.vocabularies = vocabularies;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QName getName() {
|
public QName getName() {
|
||||||
return new QName("http://eu/dnetlib/trasform/extension", "clean");
|
return new QName(QNAME_BASE_URI + "/clean", "clean");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -30,23 +30,22 @@ public class Cleaner implements ExtensionFunction, Serializable {
|
||||||
@Override
|
@Override
|
||||||
public SequenceType[] getArgumentTypes() {
|
public SequenceType[] getArgumentTypes() {
|
||||||
return new SequenceType[] {
|
return new SequenceType[] {
|
||||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE),
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
|
||||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
||||||
|
XdmValue r = xdmValues[0];
|
||||||
|
if (r.size() == 0) {
|
||||||
|
return new XdmAtomicValue("");
|
||||||
|
}
|
||||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||||
final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
|
final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
|
||||||
Optional<Term> cleanedValue = vocabularies
|
Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue);
|
||||||
.get(vocabularyName)
|
|
||||||
.getTerms()
|
|
||||||
.stream()
|
|
||||||
.filter(it -> it.getNativeName().equalsIgnoreCase(currentValue))
|
|
||||||
.findAny();
|
|
||||||
|
|
||||||
return new XdmAtomicValue(
|
return new XdmAtomicValue(
|
||||||
cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue);
|
cleanedValue != null ? cleanedValue.getClassid() : currentValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,120 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.transformation.xslt;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.time.format.DateTimeFormatter;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import net.sf.saxon.s9api.*;
|
||||||
|
|
||||||
|
public class DateCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
|
private final static List<Pattern> dateRegex = Arrays
|
||||||
|
.asList(
|
||||||
|
// Y-M-D
|
||||||
|
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||||
|
// M-D-Y
|
||||||
|
Pattern
|
||||||
|
.compile(
|
||||||
|
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
||||||
|
Pattern.MULTILINE),
|
||||||
|
// D-M-Y
|
||||||
|
Pattern
|
||||||
|
.compile(
|
||||||
|
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
||||||
|
Pattern.MULTILINE),
|
||||||
|
// Y
|
||||||
|
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
|
||||||
|
|
||||||
|
private final static Pattern incompleteDateRegex = Pattern
|
||||||
|
.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
|
||||||
|
|
||||||
|
private final static List<DateTimeFormatter> dformats = Arrays
|
||||||
|
.asList(
|
||||||
|
DateTimeFormatter
|
||||||
|
.ofPattern(
|
||||||
|
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
||||||
|
Locale.ENGLISH),
|
||||||
|
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
|
||||||
|
|
||||||
|
public String clean(final String inputDate) {
|
||||||
|
|
||||||
|
Optional<String> cleanedDate = dateRegex
|
||||||
|
.stream()
|
||||||
|
.map(
|
||||||
|
p -> {
|
||||||
|
final Matcher matcher = p.matcher(inputDate);
|
||||||
|
if (matcher.find())
|
||||||
|
return matcher.group(0);
|
||||||
|
else
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.map(m -> {
|
||||||
|
Optional<String> cleanDate = dformats
|
||||||
|
.stream()
|
||||||
|
.map(f -> {
|
||||||
|
try {
|
||||||
|
LocalDate parsedDate = LocalDate.parse(m, f);
|
||||||
|
if (parsedDate != null)
|
||||||
|
return parsedDate.toString();
|
||||||
|
else
|
||||||
|
return null;
|
||||||
|
} catch (Throwable e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.findAny();
|
||||||
|
|
||||||
|
return cleanDate.orElse(null);
|
||||||
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.findAny();
|
||||||
|
|
||||||
|
if (cleanedDate.isPresent())
|
||||||
|
return cleanedDate.get();
|
||||||
|
|
||||||
|
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
|
||||||
|
if (matcher.find()) {
|
||||||
|
final Integer year = Integer.parseInt(matcher.group(1));
|
||||||
|
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4));
|
||||||
|
return String.format("%d-%02d-01", year, month);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public QName getName() {
|
||||||
|
return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SequenceType getResultType() {
|
||||||
|
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SequenceType[] getArgumentTypes() {
|
||||||
|
return new SequenceType[] {
|
||||||
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
||||||
|
XdmValue r = xdmValues[0];
|
||||||
|
if (r.size() == 0) {
|
||||||
|
return new XdmAtomicValue("");
|
||||||
|
}
|
||||||
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||||
|
return new XdmAtomicValue(clean(currentValue));
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,41 +1,39 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation;
|
package eu.dnetlib.dhp.transformation.xslt;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.util.Map;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
import javax.xml.transform.stream.StreamSource;
|
import javax.xml.transform.stream.StreamSource;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
||||||
import eu.dnetlib.dhp.transformation.functions.Cleaner;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
||||||
import net.sf.saxon.s9api.*;
|
import net.sf.saxon.s9api.*;
|
||||||
|
|
||||||
public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
|
public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {
|
||||||
|
|
||||||
|
public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";
|
||||||
|
|
||||||
|
private final AggregationCounter aggregationCounter;
|
||||||
|
|
||||||
private final LongAccumulator totalItems;
|
|
||||||
private final LongAccumulator errorItems;
|
|
||||||
private final LongAccumulator transformedItems;
|
|
||||||
private final String transformationRule;
|
private final String transformationRule;
|
||||||
|
|
||||||
private final Cleaner cleanFunction;
|
private final Cleaner cleanFunction;
|
||||||
|
|
||||||
private final long dateOfTransformation;
|
private final long dateOfTransformation;
|
||||||
|
|
||||||
public TransformFunction(
|
public XSLTTransformationFunction(
|
||||||
LongAccumulator totalItems,
|
final AggregationCounter aggregationCounter,
|
||||||
LongAccumulator errorItems,
|
|
||||||
LongAccumulator transformedItems,
|
|
||||||
final String transformationRule,
|
final String transformationRule,
|
||||||
long dateOfTransformation,
|
long dateOfTransformation,
|
||||||
final Map<String, Vocabulary> vocabularies)
|
final VocabularyGroup vocabularies)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
this.totalItems = totalItems;
|
this.aggregationCounter = aggregationCounter;
|
||||||
this.errorItems = errorItems;
|
|
||||||
this.transformedItems = transformedItems;
|
|
||||||
this.transformationRule = transformationRule;
|
this.transformationRule = transformationRule;
|
||||||
this.dateOfTransformation = dateOfTransformation;
|
this.dateOfTransformation = dateOfTransformation;
|
||||||
cleanFunction = new Cleaner(vocabularies);
|
cleanFunction = new Cleaner(vocabularies);
|
||||||
|
@ -43,32 +41,35 @@ public class TransformFunction implements MapFunction<MetadataRecord, MetadataRe
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public MetadataRecord call(MetadataRecord value) {
|
public MetadataRecord call(MetadataRecord value) {
|
||||||
totalItems.add(1);
|
aggregationCounter.getTotalItems().add(1);
|
||||||
try {
|
try {
|
||||||
Processor processor = new Processor(false);
|
Processor processor = new Processor(false);
|
||||||
processor.registerExtensionFunction(cleanFunction);
|
processor.registerExtensionFunction(cleanFunction);
|
||||||
|
processor.registerExtensionFunction(new DateCleaner());
|
||||||
|
|
||||||
final XsltCompiler comp = processor.newXsltCompiler();
|
final XsltCompiler comp = processor.newXsltCompiler();
|
||||||
XsltExecutable xslt = comp
|
XsltExecutable xslt = comp
|
||||||
.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
|
.compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
|
||||||
XdmNode source = processor
|
XdmNode source = processor
|
||||||
.newDocumentBuilder()
|
.newDocumentBuilder()
|
||||||
.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
|
.build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
|
||||||
XsltTransformer trans = xslt.load();
|
XsltTransformer trans = xslt.load();
|
||||||
trans.setInitialContextNode(source);
|
trans.setInitialContextNode(source);
|
||||||
final StringWriter output = new StringWriter();
|
final StringWriter output = new StringWriter();
|
||||||
Serializer out = processor.newSerializer(output);
|
Serializer out = processor.newSerializer(output);
|
||||||
out.setOutputProperty(Serializer.Property.METHOD, "xml");
|
out.setOutputProperty(Serializer.Property.METHOD, "xml");
|
||||||
out.setOutputProperty(Serializer.Property.INDENT, "yes");
|
out.setOutputProperty(Serializer.Property.INDENT, "yes");
|
||||||
|
|
||||||
trans.setDestination(out);
|
trans.setDestination(out);
|
||||||
trans.transform();
|
trans.transform();
|
||||||
final String xml = output.toString();
|
final String xml = output.toString();
|
||||||
value.setBody(xml);
|
value.setBody(xml);
|
||||||
value.setDateOfTransformation(dateOfTransformation);
|
value.setDateOfTransformation(dateOfTransformation);
|
||||||
transformedItems.add(1);
|
aggregationCounter.getProcessedItems().add(1);
|
||||||
return value;
|
return value;
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
errorItems.add(1);
|
aggregationCounter.getErrorItems().add(1);
|
||||||
return null;
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
TUBYDI - Assistir Filmes e Series Online Grátis
|
||||||
|
123Movies
|
||||||
|
WATCH FULL MOVIE
|
||||||
|
Movierulz
|
||||||
|
Full Movie Online
|
||||||
|
MOVIé WatcH
|
||||||
|
The King of Staten Island 2020 Online For Free
|
||||||
|
Watch Train to Busan 2 2020 online for free
|
||||||
|
Sixth Sense Movie Novelization
|
||||||
|
Film Complet streaming vf gratuit en ligne
|
||||||
|
watch now free
|
||||||
|
LIVE stream watch
|
||||||
|
LIVE stream UFC
|
||||||
|
RBC Heritage live stream
|
||||||
|
MLBStreams Free
|
||||||
|
NFL Live Stream
|
||||||
|
Live Stream Free
|
||||||
|
Royal Ascot 2020 Live Stream
|
||||||
|
TV Shows Full Episodes Official
|
||||||
|
FuboTV
|
||||||
|
Gomovies
|
||||||
|
Online Free Trial Access
|
||||||
|
123watch
|
||||||
|
DÜŞÜK HAPI
|
||||||
|
Bebek Düşürme Yöntemleri
|
||||||
|
WHATSAP İLETİŞİM
|
||||||
|
Cytotec
|
||||||
|
düşük hapı
|
|
@ -0,0 +1,21 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the source mdstore path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "targetPath",
|
||||||
|
"paramDescription": "the target mdstore path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "m",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "the master name",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,33 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the source mdstore path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "targetPath",
|
||||||
|
"paramDescription": "the target mdstore path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "tr",
|
||||||
|
"paramLongName": "transformationRule",
|
||||||
|
"paramDescription": "the transformation Rule",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "m",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "the master name",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "i",
|
||||||
|
"paramLongName": "isLookupUrl",
|
||||||
|
"paramDescription": "the isLookup URL",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "targetPath",
|
||||||
|
"paramDescription": "the path of the sequencial file to write",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "d",
|
||||||
|
"paramLongName": "dataciteDumpPath",
|
||||||
|
"paramDescription": "the path of the Datacite dump",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "n",
|
||||||
|
"paramLongName": "namenode",
|
||||||
|
"paramDescription": "the hive metastore uris",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "m",
|
||||||
|
"paramLongName": "master",
|
||||||
|
"paramDescription": "the master name",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,23 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,123 @@
|
||||||
|
<workflow-app name="Transformation_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>mdstoreInputPath</name>
|
||||||
|
<description>the path of the input MDStore</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>mdstoreOutputPath</name>
|
||||||
|
<description>the path of the cleaned mdstore</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nativeInputPath</name>
|
||||||
|
<description>the path of the input MDStore</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="resume_from"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<decision name="resume_from">
|
||||||
|
<switch>
|
||||||
|
<case to="TransformJob">${wf:conf('resumeFrom') eq 'TransformJob'}</case>
|
||||||
|
<case to="ExportDataset">${wf:conf('resumeFrom') eq 'ExportDataset'}</case>
|
||||||
|
<default to="ImportDatacite"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="ImportDatacite">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>ImportDatacite</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-t</arg><arg>${nativeInputPath}</arg>
|
||||||
|
<arg>-d</arg><arg>${mdstoreInputPath}</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="TransformJob"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="TransformJob">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>TransformJob</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${mdstoreInputPath}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${mdstoreOutputPath}</arg>
|
||||||
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>-tr</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="DeletePathIfExists"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="DeletePathIfExists">
|
||||||
|
<fs>
|
||||||
|
<delete path='${mdstoreOutputPath}_raw_AS'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="ExportDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="ExportDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>ExportDataset</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${mdstoreOutputPath}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${mdstoreOutputPath}_raw_AS</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -1,86 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"paramName": "issm",
|
|
||||||
"paramLongName": "isSparkSessionManaged",
|
|
||||||
"paramDescription": "when true will stop SparkSession after job execution",
|
|
||||||
"paramRequired": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "e",
|
|
||||||
"paramLongName": "encoding",
|
|
||||||
"paramDescription": "the encoding of the input record should be JSON or XML",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "d",
|
|
||||||
"paramLongName": "dateOfCollection",
|
|
||||||
"paramDescription": "the date when the record has been stored",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "p",
|
|
||||||
"paramLongName": "provenance",
|
|
||||||
"paramDescription": "the infos about the provenance of the collected records",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "x",
|
|
||||||
"paramLongName": "xpath",
|
|
||||||
"paramDescription": "the xpath to identify the record identifier",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "i",
|
|
||||||
"paramLongName": "input",
|
|
||||||
"paramDescription": "the path of the sequencial file to read",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "o",
|
|
||||||
"paramLongName": "output",
|
|
||||||
"paramDescription": "the path of the result DataFrame on HDFS",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ru",
|
|
||||||
"paramLongName": "rabbitUser",
|
|
||||||
"paramDescription": "the user to connect with RabbitMq for messaging",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rp",
|
|
||||||
"paramLongName": "rabbitPassword",
|
|
||||||
"paramDescription": "the password to connect with RabbitMq for messaging",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rh",
|
|
||||||
"paramLongName": "rabbitHost",
|
|
||||||
"paramDescription": "the host of the RabbitMq server",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ro",
|
|
||||||
"paramLongName": "rabbitOngoingQueue",
|
|
||||||
"paramDescription": "the name of the ongoing queue",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rr",
|
|
||||||
"paramLongName": "rabbitReportQueue",
|
|
||||||
"paramDescription": "the name of the report queue",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "w",
|
|
||||||
"paramLongName": "workflowId",
|
|
||||||
"paramDescription": "the identifier of the dnet Workflow",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "t",
|
|
||||||
"paramLongName": "isTest",
|
|
||||||
"paramDescription": "the name of the report queue",
|
|
||||||
"paramRequired": false
|
|
||||||
}
|
|
||||||
]
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "a",
|
||||||
|
"paramLongName": "apidescriptor",
|
||||||
|
"paramDescription": "the JSON encoding of the API Descriptor",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "n",
|
||||||
|
"paramLongName": "namenode",
|
||||||
|
"paramDescription": "the Name Node URI",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mv",
|
||||||
|
"paramLongName": "mdStoreVersion",
|
||||||
|
"paramDescription": "the MDStore Version bean",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dm",
|
||||||
|
"paramLongName": "dnetMessageManagerURL",
|
||||||
|
"paramDescription": "the End point URL to send Messages",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "w",
|
||||||
|
"paramLongName": "workflowId",
|
||||||
|
"paramDescription": "the identifier of the dnet Workflow",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mnr",
|
||||||
|
"paramLongName": "maxNumberOfRetry",
|
||||||
|
"paramDescription": "the maximum number of admitted connection retries",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rqd",
|
||||||
|
"paramLongName": "requestDelay",
|
||||||
|
"paramDescription": "the delay (ms) between requests",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rtd",
|
||||||
|
"paramLongName": "retryDelay",
|
||||||
|
"paramDescription": "the delay (ms) between retries",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "cto",
|
||||||
|
"paramLongName": "connectTimeOut",
|
||||||
|
"paramDescription": "the maximum allowed time (ms) to connect to the remote host",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rto",
|
||||||
|
"paramLongName": "readTimeOut",
|
||||||
|
"paramDescription": "the maximum allowed time (ms) to receive content from the remote host",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,50 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "e",
|
||||||
|
"paramLongName": "encoding",
|
||||||
|
"paramDescription": "the encoding of the input record should be JSON or XML",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "d",
|
||||||
|
"paramLongName": "dateOfCollection",
|
||||||
|
"paramDescription": "the date when the record has been stored",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "p",
|
||||||
|
"paramLongName": "provenance",
|
||||||
|
"paramDescription": "the infos about the provenance of the collected records",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "x",
|
||||||
|
"paramLongName": "xpath",
|
||||||
|
"paramDescription": "the xpath to identify the record identifier",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mv",
|
||||||
|
"paramLongName": "mdStoreVersion",
|
||||||
|
"paramDescription": "the Metadata Store Version Info",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rmv",
|
||||||
|
"paramLongName": "readMdStoreVersion",
|
||||||
|
"paramDescription": "the Read Lock Metadata Store Version bean",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "w",
|
||||||
|
"paramLongName": "workflowId",
|
||||||
|
"paramDescription": "the identifier of the dnet Workflow",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,45 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "a",
|
||||||
|
"paramLongName": "action",
|
||||||
|
"paramDescription": "the JSON encoding of the API Descriptor",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mu",
|
||||||
|
"paramLongName": "mdStoreManagerURI",
|
||||||
|
"paramDescription": "the MDStore Manager URI",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mi",
|
||||||
|
"paramLongName": "mdStoreID",
|
||||||
|
"paramDescription": "the Metadata Store ID",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "ms",
|
||||||
|
"paramLongName": "mdStoreSize",
|
||||||
|
"paramDescription": "the Metadata Store Size",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "mv",
|
||||||
|
"paramLongName": "mdStoreVersion",
|
||||||
|
"paramDescription": "the Metadata Version Bean",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "n",
|
||||||
|
"paramLongName": "namenode",
|
||||||
|
"paramDescription": "the Name Node URI",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rm",
|
||||||
|
"paramLongName": "readMDStoreId",
|
||||||
|
"paramDescription": "the ID Locked to Read",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
|
@ -0,0 +1,22 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -1,112 +1,212 @@
|
||||||
<workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
|
||||||
<name>sequenceFilePath</name>
|
|
||||||
<description>the path to store the sequence file of the native metadata collected</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
<property>
|
|
||||||
<name>mdStorePath</name>
|
|
||||||
<description>the path of the native mdstore</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>apiDescription</name>
|
<name>apiDescription</name>
|
||||||
<description>A json encoding of the API Description class</description>
|
<description>A json encoding of the API Description class</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>dataSourceInfo</name>
|
<name>dataSourceInfo</name>
|
||||||
<description>A json encoding of the Datasource Info</description>
|
<description>A json encoding of the Datasource Info</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>identifierPath</name>
|
<name>identifierPath</name>
|
||||||
<description>An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier </description>
|
<description>An xpath to retrieve the metadata identifier for the generation of DNet Identifier </description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>metadataEncoding</name>
|
<name>metadataEncoding</name>
|
||||||
<description> The type of the metadata XML/JSON</description>
|
<description> The type of the metadata XML/JSON</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>timestamp</name>
|
<name>timestamp</name>
|
||||||
<description>The timestamp of the collection date</description>
|
<description>The timestamp of the collection date</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>workflowId</name>
|
<name>workflowId</name>
|
||||||
<description>The identifier of the workflow</description>
|
<description>The identifier of the workflow</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>mdStoreID</name>
|
||||||
|
<description>The identifier of the mdStore</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>mdStoreManagerURI</name>
|
||||||
|
<description>The URI of the MDStore Manager</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dnetMessageManagerURL</name>
|
||||||
|
<description>The URI of the Dnet Message Manager</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>collectionMode</name>
|
||||||
|
<description>Should be REFRESH or INCREMENTAL</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>collection_java_xmx</name>
|
||||||
|
<value>-Xmx200m</value>
|
||||||
|
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="DeleteMDStoresNative"/>
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="collection_mode"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
<action name="DeleteMDStoresNative">
|
|
||||||
<fs>
|
<decision name="collection_mode">
|
||||||
<mkdir path='${sequenceFilePath}'/>
|
<switch>
|
||||||
<mkdir path='${mdStorePath}'/>
|
<case to="StartTransaction">${wf:conf('collectionMode') eq 'REFRESH'}</case>
|
||||||
<delete path='${sequenceFilePath}'/>
|
<case to="BeginRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
|
||||||
<delete path='${mdStorePath}'/>
|
<default to="StartTransaction"/>
|
||||||
</fs>
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="BeginRead">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>READ_LOCK</arg>
|
||||||
|
<arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<capture-output/>
|
||||||
|
</java>
|
||||||
|
<ok to="StartTransaction"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="StartTransaction">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||||
|
<arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<capture-output/>
|
||||||
|
</java>
|
||||||
<ok to="CollectionWorker"/>
|
<ok to="CollectionWorker"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="CollectionWorker">
|
<action name="CollectionWorker">
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
|
||||||
<name-node>${nameNode}</name-node>
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
<main-class>eu.dnetlib.dhp.collection.worker.DnetCollectorWorker</main-class>
|
<arg>--apidescriptor</arg><arg>${apiDescription}</arg>
|
||||||
<java-opts>-p</java-opts><java-opts>${sequenceFilePath}</java-opts>
|
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||||
<java-opts>-a</java-opts><java-opts>${apiDescription}</java-opts>
|
<arg>--workflowId</arg><arg>${workflowId}</arg>
|
||||||
<java-opts>-n</java-opts><java-opts>${nameNode}</java-opts>
|
<arg>--dnetMessageManagerURL</arg><arg>${dnetMessageManagerURL}</arg>
|
||||||
<java-opts>-rh</java-opts><java-opts>${rmq_host}</java-opts>
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
<java-opts>-ru</java-opts><java-opts>${rmq_user}</java-opts>
|
<arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
|
||||||
<java-opts>-rp</java-opts><java-opts>${rmq_pwd}</java-opts>
|
<arg>--requestDelay</arg><arg>${requestDelay}</arg>
|
||||||
<java-opts>-rr</java-opts><java-opts>${rmq_report}</java-opts>
|
<arg>--retryDelay</arg><arg>${retryDelay}</arg>
|
||||||
<java-opts>-ro</java-opts><java-opts>${rmq_ongoing}</java-opts>
|
<arg>--connectTimeOut</arg><arg>${connectTimeOut}</arg>
|
||||||
<java-opts>-u</java-opts><java-opts>sandro.labruzzo</java-opts>
|
<arg>--readTimeOut</arg><arg>${readTimeOut}</arg>
|
||||||
<java-opts>-w</java-opts><java-opts>${workflowId}</java-opts>
|
|
||||||
</java>
|
</java>
|
||||||
<ok to="GenerateNativeStoreSparkJob"/>
|
<ok to="GenerateNativeStoreSparkJob"/>
|
||||||
<error to="Kill"/>
|
<error to="FailCollection"/>
|
||||||
</action>
|
|
||||||
<action name="GenerateNativeStoreSparkJob">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>GenerateNativeStoreSparkJob</name>
|
|
||||||
<class>eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob</class>
|
|
||||||
<jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
|
|
||||||
<spark-opts>--num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"</spark-opts>
|
|
||||||
<arg>--encoding</arg> <arg>${metadataEncoding}</arg>
|
|
||||||
<arg>--dateOfCollection</arg> <arg>${timestamp}</arg>
|
|
||||||
<arg>--provenance</arg> <arg> ${dataSourceInfo}</arg>
|
|
||||||
<arg>--xpath</arg><arg>${identifierPath}</arg>
|
|
||||||
<arg>--input</arg><arg>${sequenceFilePath}</arg>
|
|
||||||
<arg>--output</arg><arg>${mdStorePath}</arg>
|
|
||||||
<arg>-rh</arg><arg>${rmq_host}</arg>
|
|
||||||
<arg>-ru</arg><arg>${rmq_user}</arg>
|
|
||||||
<arg>-rp</arg><arg>${rmq_pwd}</arg>
|
|
||||||
<arg>-rr</arg><arg>${rmq_report}</arg>
|
|
||||||
<arg>-ro</arg><arg>${rmq_ongoing}</arg>
|
|
||||||
<arg>-w</arg><arg>${workflowId}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="DropInvalidStore"/>
|
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="DropInvalidStore">
|
<action name="GenerateNativeStoreSparkJob">
|
||||||
<fs>
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<delete path='${mdStorePath}/../'/>
|
<master>yarn</master>
|
||||||
</fs>
|
<mode>cluster</mode>
|
||||||
|
<name>Generate Native MetadataStore</name>
|
||||||
|
<class>eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--encoding</arg><arg>${metadataEncoding}</arg>
|
||||||
|
<arg>--dateOfCollection</arg><arg>${timestamp}</arg>
|
||||||
|
<arg>--provenance</arg><arg>${dataSourceInfo}</arg>
|
||||||
|
<arg>--xpath</arg><arg>${identifierPath}</arg>
|
||||||
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--readMdStoreVersion</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="collection_mode_end"/>
|
||||||
|
<error to="FailCollection"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="collection_mode_end">
|
||||||
|
<switch>
|
||||||
|
<case to="CommitVersion">${wf:conf('collectionMode') eq 'REFRESH'}</case>
|
||||||
|
<case to="EndRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
|
||||||
|
<default to="CommitVersion"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="EndRead">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="CommitVersion"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="CommitVersion">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>COMMIT</arg>
|
||||||
|
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="FailCollection">
|
||||||
|
<switch>
|
||||||
|
<case to="RollBack">${wf:conf('collectionMode') eq 'REFRESH'}</case>
|
||||||
|
<case to="EndReadRollBack">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
|
||||||
|
<default to="RollBack"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="EndReadRollBack">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="RollBack"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="RollBack">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<java-opts>${collection_java_xmx}</java-opts>
|
||||||
|
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||||
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
</java>
|
||||||
<ok to="Kill"/>
|
<ok to="Kill"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,12 +0,0 @@
|
||||||
[
|
|
||||||
{"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
|
|
||||||
{"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
|
|
||||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
|
|
||||||
{"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true},
|
|
||||||
{"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true},
|
|
||||||
{"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true},
|
|
||||||
{"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true},
|
|
||||||
{"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true},
|
|
||||||
{"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true},
|
|
||||||
{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}
|
|
||||||
]
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</configuration>
|
|
@ -1,76 +1,187 @@
|
||||||
<workflow-app name="Transformation_Workflow" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Transformation_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>mdstoreInputPath</name>
|
<name>mdStoreInputId</name>
|
||||||
<description>the path of the input MDStore</description>
|
<description>the identifier of the native MDStore</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>mdstoreOutputPath</name>
|
<name>mdStoreOutputId</name>
|
||||||
|
<description>the identifier of the cleaned MDStore</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>mdStoreManagerURI</name>
|
||||||
<description>the path of the cleaned mdstore</description>
|
<description>the path of the cleaned mdstore</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>transformationRule</name>
|
<name>transformationRuleId</name>
|
||||||
<description>The transformation Rule to apply</description>
|
<description>The transformation Rule to apply</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>timestamp</name>
|
<name>transformationPlugin</name>
|
||||||
<description>The timestamp of the collection date</description>
|
<value>XSLT_TRANSFORM</value>
|
||||||
|
<description>The transformation Plugin</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dateOfTransformation</name>
|
||||||
|
<description>The timestamp of the transformation date</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>isLookupUrl</name>
|
||||||
|
<description>The IS lookUp service endopoint</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>workflowId</name>
|
<name>workflowId</name>
|
||||||
<description>The identifier of the workflow</description>
|
<description>The identifier of the workflow</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dnetMessageManagerURL</name>
|
||||||
|
<description>The URI of the Dnet Message Manager</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="DeletePathIfExists"/>
|
<start to="BeginRead"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
<action name="DeletePathIfExists">
|
|
||||||
<fs>
|
<action name="BeginRead">
|
||||||
<mkdir path='${mdstoreOutputPath}'/>
|
<java>
|
||||||
<delete path='${mdstoreOutputPath}'/>
|
<configuration>
|
||||||
</fs>
|
<property>
|
||||||
<ok to="TransformJob"/>
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<arg>--action</arg><arg>READ_LOCK</arg>
|
||||||
|
<arg>--mdStoreID</arg><arg>${mdStoreInputId}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<capture-output/>
|
||||||
|
</java>
|
||||||
|
<ok to="StartTransaction"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
<action name="TransformJob">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<action name="StartTransaction">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<java>
|
||||||
<name-node>${nameNode}</name-node>
|
<configuration>
|
||||||
<master>yarn</master>
|
<property>
|
||||||
<mode>cluster</mode>
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
<name>MDBuilder</name>
|
<value>true</value>
|
||||||
<class>eu.dnetlib.dhp.transformation.TransformSparkJobNode</class>
|
</property>
|
||||||
<jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
|
</configuration>
|
||||||
<spark-opts>--num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"</spark-opts>
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
<arg>--dateOfCollection</arg> <arg>${timestamp}</arg>
|
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||||
<arg>-mt</arg> <arg>yarn</arg>
|
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||||
<arg>--input</arg><arg>${mdstoreInputPath}</arg>
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
<arg>--output</arg><arg>${mdstoreOutputPath}</arg>
|
<capture-output/>
|
||||||
<arg>-w</arg><arg>${workflowId}</arg>
|
</java>
|
||||||
<arg>-tr</arg><arg>${transformationRule}</arg>
|
<ok to="TransformJob"/>
|
||||||
<arg>-ru</arg><arg>${rmq_user}</arg>
|
<error to="EndReadRollBack"/>
|
||||||
<arg>-rp</arg><arg>${rmq_pwd}</arg>
|
|
||||||
<arg>-rh</arg><arg>${rmq_host}</arg>
|
|
||||||
<arg>-ro</arg><arg>${rmq_ongoing}</arg>
|
|
||||||
<arg>-rr</arg><arg>${rmq_report}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="DropInvalidStore"/>
|
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="DropInvalidStore">
|
<action name="TransformJob">
|
||||||
<fs>
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<delete path='${mdstoreOutputPath}/../'/>
|
<master>yarn</master>
|
||||||
</fs>
|
<mode>cluster</mode>
|
||||||
|
<name>Transform MetadataStore</name>
|
||||||
|
<class>eu.dnetlib.dhp.transformation.TransformSparkJobNode</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--mdstoreInputVersion</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
<arg>--dateOfTransformation</arg><arg>${dateOfTransformation}</arg>
|
||||||
|
<arg>--transformationPlugin</arg><arg>${transformationPlugin}</arg>
|
||||||
|
<arg>--transformationRuleId</arg><arg>${transformationRuleId}</arg>
|
||||||
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--workflowId</arg><arg>${workflowId}</arg>
|
||||||
|
<arg>--dnetMessageManagerURL</arg><arg>${dnetMessageManagerURL}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="EndRead"/>
|
||||||
|
<error to="EndReadRollBack"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="EndRead">
|
||||||
|
<java>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
<capture-output/>
|
||||||
|
</java>
|
||||||
|
<ok to="CommitVersion"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="CommitVersion">
|
||||||
|
<java>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<arg>--action</arg><arg>COMMIT</arg>
|
||||||
|
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="EndReadRollBack">
|
||||||
|
<java>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||||
|
<capture-output/>
|
||||||
|
</java>
|
||||||
|
<ok to="RollBack"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="RollBack">
|
||||||
|
<java>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||||
|
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||||
|
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||||
|
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||||
|
</java>
|
||||||
<ok to="Kill"/>
|
<ok to="Kill"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -7,20 +7,39 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "d",
|
"paramName": "d",
|
||||||
"paramLongName": "dateOfCollection",
|
"paramLongName": "dateOfTransformation",
|
||||||
"paramDescription": "the date when the record has been stored",
|
"paramDescription": "the date when the record has been stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "i",
|
"paramName": "i",
|
||||||
"paramLongName": "input",
|
"paramLongName": "mdstoreInputVersion",
|
||||||
"paramDescription": "the path of the sequencial file to read",
|
"paramDescription": "the mdStore Version bean of the Input",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "output",
|
"paramLongName": "mdstoreOutputVersion",
|
||||||
"paramDescription": "the path of the result DataFrame on HDFS",
|
"paramDescription": "the mdStore Version bean of the Output",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "tr",
|
||||||
|
"paramLongName": "transformationRuleId",
|
||||||
|
"paramDescription": "the transformation Rule to apply to the input MDStore",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "i",
|
||||||
|
"paramLongName": "isLookupUrl",
|
||||||
|
"paramDescription": "the Information System Service LookUp URL",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dm",
|
||||||
|
"paramLongName": "dnetMessageManagerURL",
|
||||||
|
"paramDescription": "the End point URL to send Messages",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -30,45 +49,9 @@
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "tr",
|
"paramName": "tp",
|
||||||
"paramLongName": "transformationRule",
|
"paramLongName": "transformationPlugin",
|
||||||
"paramDescription": "the transformation Rule to apply to the input MDStore",
|
"paramDescription": "the transformation plugin to apply",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ru",
|
|
||||||
"paramLongName": "rabbitUser",
|
|
||||||
"paramDescription": "the user to connect with RabbitMq for messaging",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rp",
|
|
||||||
"paramLongName": "rabbitPassword",
|
|
||||||
"paramDescription": "the password to connect with RabbitMq for messaging",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rh",
|
|
||||||
"paramLongName": "rabbitHost",
|
|
||||||
"paramDescription": "the host of the RabbitMq server",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ro",
|
|
||||||
"paramLongName": "rabbitOngoingQueue",
|
|
||||||
"paramDescription": "the name of the ongoing queue",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "rr",
|
|
||||||
"paramLongName": "rabbitReportQueue",
|
|
||||||
"paramDescription": "the name of the report queue",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "t",
|
|
||||||
"paramLongName": "isTest",
|
|
||||||
"paramDescription": "the name of the report queue",
|
|
||||||
"paramRequired": false
|
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -12,15 +12,15 @@ import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
|
|
||||||
@Disabled
|
@Disabled
|
||||||
public class EXCELParserTest {
|
public class EXCELParserTest {
|
||||||
|
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
private HttpConnector httpConnector = new HttpConnector();
|
private HttpConnector2 httpConnector = new HttpConnector2();
|
||||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
|
@ -30,7 +30,7 @@ public class EXCELParserTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException,
|
public void test1() throws CollectorException, IOException, InvalidFormatException, ClassNotFoundException,
|
||||||
IllegalAccessException, InstantiationException {
|
IllegalAccessException, InstantiationException {
|
||||||
|
|
||||||
EXCELParser excelParser = new EXCELParser();
|
EXCELParser excelParser = new EXCELParser();
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.httpconnector;
|
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
|
||||||
import org.apache.http.ssl.SSLContextBuilder;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
@Disabled
|
|
||||||
public class HttpConnectorTest {
|
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(HttpConnectorTest.class);
|
|
||||||
private static HttpConnector connector;
|
|
||||||
|
|
||||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
|
||||||
private static final String URL_MISCONFIGURED_SERVER = "https://www.alexandria.unisg.ch/cgi/oai2?verb=Identify";
|
|
||||||
private static final String URL_GOODSNI_SERVER = "https://air.unimi.it/oai/openaire?verb=Identify";
|
|
||||||
|
|
||||||
private static final SSLContextBuilder sslContextBuilder = new SSLContextBuilder();
|
|
||||||
private static SSLConnectionSocketFactory sslSocketFactory;
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void setUp() {
|
|
||||||
connector = new HttpConnector();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
|
|
||||||
public void testGetInputSource() throws CollectorServiceException {
|
|
||||||
System.out.println(connector.getInputSource(URL));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testGoodServers() throws CollectorServiceException {
|
|
||||||
System.out.println(connector.getInputSource(URL_GOODSNI_SERVER));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.aggregation;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.transformation.TransformationFactory;
|
||||||
|
import eu.dnetlib.dhp.transformation.TransformationJobTest;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
public abstract class AbstractVocabularyTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
protected ISLookUpService isLookUpService;
|
||||||
|
|
||||||
|
protected VocabularyGroup vocabularies;
|
||||||
|
|
||||||
|
public void setUpVocabulary() throws ISLookUpException, IOException {
|
||||||
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||||
|
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||||
|
.thenReturn(synonyms());
|
||||||
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> vocs() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> synonyms() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt"));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void mockupTrasformationRule(final String trule, final String path) throws Exception {
|
||||||
|
final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
|
||||||
|
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule)))
|
||||||
|
.thenReturn(Collections.singletonList(trValue));
|
||||||
|
}
|
||||||
|
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue