diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index b295bc1f19..4ee706169d 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -29,6 +29,12 @@
spark-sql_2.11
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
commons-cli
commons-cli
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
index 3fa5fcbab2..17482c0198 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@@ -94,7 +94,13 @@ public class AuthorMerger {
if (r.getPid() == null) {
r.setPid(new ArrayList<>());
}
- r.getPid().add(a._1());
+
+ // HACK: the PID list may have been created with Arrays.asList, which returns a fixed-size
+ // list whose add method throws UnsupportedOperationException (raised by
+ // java.util.AbstractList.add), so the PIDs are copied into a new ArrayList before adding.
+ final List tmp = new ArrayList<>(r.getPid());
+ tmp.add(a._1());
+ r.setPid(tmp);
}
}
});
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
similarity index 79%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
index 84f88003b2..12fbcc490f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
@@ -1,8 +1,9 @@
-package eu.dnetlib.dhp.oa.graph.clean;
+package eu.dnetlib.dhp.schema.oaf;
import java.util.LinkedHashMap;
import java.util.Objects;
+import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -10,14 +11,13 @@ import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
-import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
-import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
+ public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
+ public static final String NONE = "none";
public static T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
@@ -71,7 +71,7 @@ public class CleaningFunctions {
return value;
}
- protected static T fixDefaults(T value) {
+ public static T fixDefaults(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@@ -106,6 +106,20 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.collect(Collectors.toList()));
}
+			if (Objects.nonNull(r.getPid())) {
+				r
+					.setPid(
+						r
+							.getPid()
+							.stream()
+							.filter(Objects::nonNull)
+							.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
+							.filter(sp -> !NONE.equalsIgnoreCase(sp.getValue())) // drop the "none" placeholders, keep real PIDs
+							.filter(sp -> Objects.nonNull(sp.getQualifier()))
+							.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
+							.map(CleaningFunctions::normalizePidValue)
+							.collect(Collectors.toList()));
+			}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
@@ -125,7 +139,7 @@ public class CleaningFunctions {
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
- Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
+ Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
@@ -201,4 +215,24 @@ public class CleaningFunctions {
classid, classname, scheme, scheme);
}
+	/**
+	 * Utility method that normalises PID values on a per-type basis: currently only DOIs are rewritten.
+	 * @param pid the PID whose value will be normalised; it is modified in place.
+	 * @return the same PID instance, containing the normalised value.
+	 * @throws IllegalArgumentException when the PID value is null.
+	 */
+	public static StructuredProperty normalizePidValue(StructuredProperty pid) {
+		String value = Optional
+			.ofNullable(pid.getValue())
+			.map(String::trim)
+			.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
+
+		// TODO add cleaning for more PID types as needed
+		if ("doi".equals(pid.getQualifier().getClassid())) {
+			// Locale.ROOT: lower-casing must not depend on the JVM default locale (e.g. Turkish dotless i)
+			pid.setValue(value.toLowerCase(java.util.Locale.ROOT).replaceAll(DOI_URL_PREFIX_REGEX, ""));
+		}
+		return pid;
+	}
+
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java
similarity index 70%
rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java
index 9bc3706cdd..16fdc3760d 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java
@@ -1,14 +1,14 @@
-package eu.dnetlib.dhp.oa.provision;
+package eu.dnetlib.dhp.schema.oaf;
-public class ProvisionConstants {
+public class ModelHardLimits {
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10;
- public static final int MAX_ABSTRACT_LENGTH = 100000;
+ public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10;
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java
similarity index 84%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
rename to dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java
index 84b29e3d48..f079c55afd 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java
@@ -1,11 +1,10 @@
-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.schema.oaf;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
+
+import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
@@ -13,15 +12,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.Journal;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
-import eu.dnetlib.dhp.schema.oaf.OriginDescription;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
@@ -270,4 +261,36 @@ public class OafMapperUtils {
final Map
+
+ org.apache.httpcomponents
+ httpmime
+
org.elasticsearch
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java
new file mode 100644
index 0000000000..95bea74a2b
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java
@@ -0,0 +1,111 @@
+
+package eu.dnetlib.dhp.export.zenodo;
+
+import java.io.*;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class MakeTar implements Serializable {
+
+ private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
+
+ public static void main(String[] args) throws Exception {
+ String jsonConfiguration = IOUtils
+ .toString(
+ MakeTar.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+ parser.parseArgument(args);
+
+ final String outputPath = parser.get("targetPath");
+ log.info("hdfsPath: {}", outputPath);
+
+ final String hdfsNameNode = parser.get("nameNode");
+ log.info("nameNode: {}", hdfsNameNode);
+
+ final String inputPath = parser.get("sourcePath");
+ log.info("input path : {}", inputPath);
+
+ Configuration conf = new Configuration();
+ conf.set("fs.defaultFS", hdfsNameNode);
+
+ FileSystem fileSystem = FileSystem.get(conf);
+
+ makeTArArchive(fileSystem, inputPath, outputPath);
+
+ }
+
+ public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
+
+ RemoteIterator dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
+
+ while (dir_iterator.hasNext()) {
+ LocatedFileStatus fileStatus = dir_iterator.next();
+
+ Path p = fileStatus.getPath();
+ String p_string = p.toString();
+ String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
+
+ write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
+ }
+
+ }
+
+ private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
+ throws IOException {
+
+ Path hdfsWritePath = new Path(outputPath);
+ FSDataOutputStream fsDataOutputStream = null;
+ if (fileSystem.exists(hdfsWritePath)) {
+ fileSystem.delete(hdfsWritePath, true);
+
+ }
+ fsDataOutputStream = fileSystem.create(hdfsWritePath);
+
+ TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
+
+ RemoteIterator fileStatusListIterator = fileSystem
+ .listFiles(
+ new Path(inputPath), true);
+
+ while (fileStatusListIterator.hasNext()) {
+ LocatedFileStatus fileStatus = fileStatusListIterator.next();
+
+ Path p = fileStatus.getPath();
+ String p_string = p.toString();
+ if (!p_string.endsWith("_SUCCESS")) {
+ String name = p_string.substring(p_string.lastIndexOf("/") + 1);
+ TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
+ entry.setSize(fileStatus.getLen());
+ ar.putArchiveEntry(entry);
+
+ InputStream is = fileSystem.open(fileStatus.getPath());
+
+ BufferedInputStream bis = new BufferedInputStream(is);
+
+ int count;
+ byte data[] = new byte[1024];
+ while ((count = bis.read(data, 0, data.length)) != -1) {
+ ar.write(data, 0, count);
+ }
+ bis.close();
+ ar.closeArchiveEntry();
+
+ }
+
+ }
+
+ ar.close();
+ }
+
+}
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java
new file mode 100644
index 0000000000..1dcbf6cccc
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java
@@ -0,0 +1,80 @@
+
+package eu.dnetlib.dhp.export.zenodo;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
+import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
+
+public class SendToZenodoHDFS implements Serializable {
+
+	private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
+
+	/**
+	 * Uploads the files found under hdfsPath (except the _SUCCESS markers) to a Zenodo deposition, then sends the metadata and publishes it.
+	 * @param args the command line arguments described by upload_zenodo.json
+	 * @throws MissingConceptDoiException if a new version is requested without a concept record id
+	 * @throws Exception if the arguments cannot be parsed or an HDFS/Zenodo operation fails
+	 */
+	public static void main(final String[] args) throws Exception, MissingConceptDoiException {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SendToZenodoHDFS.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/export/upload_zenodo.json")));
+
+		parser.parseArgument(args);
+
+		final String hdfsPath = parser.get("hdfsPath");
+		final String hdfsNameNode = parser.get("nameNode");
+		final String access_token = parser.get("accessToken");
+		final String connection_url = parser.get("connectionUrl");
+		final String metadata = parser.get("metadata");
+		final boolean newDeposition = Boolean.parseBoolean(parser.get("newDeposition"));
+		// may be null: it is only checked when a new version is requested
+		final String concept_rec_id = parser.get("conceptRecordId");
+
+		Configuration conf = new Configuration();
+		conf.set("fs.defaultFS", hdfsNameNode);
+
+		FileSystem fileSystem = FileSystem.get(conf);
+
+		RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem.listFiles(new Path(hdfsPath), true);
+		ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
+		if (newDeposition) {
+			zenodoApiClient.newDeposition();
+		} else {
+			if (concept_rec_id == null) {
+				throw new MissingConceptDoiException("No concept record id has been provided");
+			}
+			zenodoApiClient.newVersion(concept_rec_id);
+		}
+
+		while (fileStatusListIterator.hasNext()) {
+			LocatedFileStatus fileStatus = fileStatusListIterator.next();
+			Path p = fileStatus.getPath();
+
+			if (!p.toString().endsWith("_SUCCESS")) {
+				String name = p.getName();
+				log.info("Sending information for community: " + name);
+				// NOTE(review): assumes uploadIS has fully consumed the stream when it returns - confirm
+				try (FSDataInputStream inputStream = fileSystem.open(p)) {
+					zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
+				}
+			}
+		}
+
+		zenodoApiClient.sendMretadata(metadata);
+		zenodoApiClient.publish();
+	}
+
+}
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json
new file mode 100644
index 0000000000..6d90ced2cb
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json
@@ -0,0 +1,20 @@
+[
+ {
+ "paramName": "n",
+ "paramLongName": "nameNode",
+ "paramDescription": "the Name Node",
+ "paramRequired": true
+ },
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the source path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "t",
+ "paramLongName": "targetPath",
+ "paramDescription": "the target path",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json
new file mode 100644
index 0000000000..66676005e9
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/upload_zenodo.json
@@ -0,0 +1,45 @@
+
+[
+ {
+ "paramName":"nd",
+ "paramLongName":"newDeposition",
+ "paramDescription": "if it is a new deposition (true) or a new version (false)",
+ "paramRequired": true
+ },
+ {
+ "paramName":"cri",
+ "paramLongName":"conceptRecordId",
+ "paramDescription": "The id of the concept record for a new version",
+ "paramRequired": false
+ },
+ {
+ "paramName":"hdfsp",
+ "paramLongName":"hdfsPath",
+ "paramDescription": "the path of the folder to find files to send to Zenodo",
+ "paramRequired": true
+ },
+ {
+ "paramName": "nn",
+ "paramLongName": "nameNode",
+ "paramDescription": "the name node",
+ "paramRequired": true
+ },
+ {
+ "paramName": "at",
+ "paramLongName": "accessToken",
+ "paramDescription": "the access token for the deposition",
+ "paramRequired": false
+ },
+ {
+ "paramName":"cu",
+ "paramLongName":"connectionUrl",
+ "paramDescription": "the url to connect to deposit",
+ "paramRequired": false
+ },
+ {
+ "paramName":"m",
+ "paramLongName":"metadata",
+ "paramDescription": "metadata associated to the deposition",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml
new file mode 100644
index 0000000000..3b9aaca2a6
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/config-default.xml
@@ -0,0 +1,48 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ oozie.wf.rerun.failnodes
+ false
+
+
+ hive_metastore_uris
+ thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ "com.cloudera.spark.lineage.NavigatorAppListener"
+
+
+ spark2SqlQueryExecutionListeners
+ "com.cloudera.spark.lineage.NavigatorQueryListener"
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml
new file mode 100644
index 0000000000..6d70565033
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml
@@ -0,0 +1,53 @@
+
+
+
+ sourcePath
+ the source path
+
+
+ targetPath
+ the target path
+
+
+ metadata
+ the metadata
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.export.zenodo.MakeTar
+ -t${targetPath}
+ -n${nameNode}
+ -s${sourcePath}
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS
+ --hdfsPath/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar
+ --nameNode${nameNode}
+ --accessToken${accessToken}
+ --connectionUrlhttps://zenodo.org/api/deposit/depositions
+ --metadata${metadata}
+ --conceptRecordId1200252
+ --newDepositionfalse
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala
index cb04cf9e95..c62d169bc8 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala
@@ -3,14 +3,15 @@ package eu.dnetlib.dhp.export
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
+import eu.dnetlib.dhp.provision.scholix.Scholix
+import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
-
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import scala.io.Source
-
+import scala.collection.JavaConverters._
class ExportDLITOOAFTest {
val mapper = new ObjectMapper()
@@ -22,12 +23,27 @@ class ExportDLITOOAFTest {
}
+  /** Lists the names of the providers the target of the given Scholix was collected from. */
+  def extractDatasources(s: Scholix): List[String] =
+    s.getTarget.getCollectedFrom.asScala.map(_.getProvider.getName).toList
+
+
+  /**
+   * Lists the names of the datasources attached to the given summary.
+   */
+  def extractDatasources(s: ScholixSummary): List[String] = {
+    s.getDatasources.asScala.map(_.getDatasourceName).toList
+  }
+
+
@Test
def testMappingRele():Unit = {
val r:Relation = new Relation
r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
+ r.setRelType("IsReferencedBy")
+
val r1 =DLIToOAF.convertDLIRelation(r)
println(r1.getSource, r1.getTarget)
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
index b08e593f73..d404850ebe 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 {
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result.getTitle().stream().findFirst().get();
- title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
+ title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title);
}
@@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 {
.getInstance()
.stream()
.filter(Objects::nonNull)
- .limit(ProvisionConstants.MAX_INSTANCES)
+ .limit(ModelHardLimits.MAX_INSTANCES)
.collect(Collectors.toList()));
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
index 7e175121e5..e32fe020b9 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
@@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 {
List refs = r
.getExternalReference()
.stream()
- .limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES)
+ .limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES)
.collect(Collectors.toList());
r.setExternalReference(refs);
}
if (r.getAuthor() != null) {
List authors = Lists.newArrayList();
for (Author a : r.getAuthor()) {
- a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH));
- if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) {
+ a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH));
+ if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) {
authors.add(a);
}
}
@@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream()
.filter(Objects::nonNull)
.map(d -> {
- d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH));
+ d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d;
})
.collect(Collectors.toList());
@@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream()
.filter(Objects::nonNull)
.map(t -> {
- t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
+ t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
return t;
})
- .limit(ProvisionConstants.MAX_TITLES)
+ .limit(ModelHardLimits.MAX_TITLES)
.collect(Collectors.toList());
r.setTitle(titles);
}
diff --git a/dhp-workflows/dhp-workflow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml b/dhp-workflows/dhp-workflow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml
index 0474333203..4eab12c73a 100644
--- a/dhp-workflows/dhp-workflow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml
+++ b/dhp-workflows/dhp-workflow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml
@@ -44,6 +44,7 @@
+
Set the target path to store the RAW graph
@@ -54,31 +55,45 @@
+
+
+ Set the target path to store the first CLEANED graph
+
+ firstCleanedGraphPath
+ /tmp/prod_provision/graph/02_graph_first_cleaned
+
+
+
+
+
+
Set the target path to store the DEDUPED graph
dedupGraphPath
- /tmp/beta_provision/graph/02_graph_dedup
+ /tmp/beta_provision/graph/03_graph_dedup
+
Set the target path to store the INFERRED graph
inferredGraphPath
- /tmp/beta_provision/graph/03_graph_inferred
+ /tmp/beta_provision/graph/04_graph_inferred
+
Set the target path to store the CONSISTENCY graph
consistentGraphPath
- /tmp/beta_provision/graph/04_graph_consistent
+ /tmp/beta_provision/graph/05_graph_consistent
@@ -89,7 +104,7 @@
Set the target path to store the ORCID enriched graph
orcidGraphPath
- /tmp/beta_provision/graph/05_graph_orcid
+ /tmp/beta_provision/graph/06_graph_orcid
@@ -100,7 +115,7 @@
Set the target path to store the BULK TAGGED graph
bulkTaggingGraphPath
- /tmp/beta_provision/graph/06_graph_bulktagging
+ /tmp/beta_provision/graph/07_graph_bulktagging
@@ -111,7 +126,7 @@
Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph
affiliationGraphPath
- /tmp/beta_provision/graph/07_graph_affiliation
+ /tmp/beta_provision/graph/08_graph_affiliation
@@ -122,7 +137,7 @@
Set the target path to store the COMMUNITY from SELECTED SOURCES graph
communityOrganizationGraphPath
- /tmp/beta_provision/graph/08_graph_comunity_organization
+ /tmp/beta_provision/graph/09_graph_comunity_organization
@@ -133,7 +148,7 @@
Set the target path to store the FUNDING from SEMANTIC RELATION graph
fundingGraphPath
- /tmp/beta_provision/graph/09_graph_funding
+ /tmp/beta_provision/graph/10_graph_funding
@@ -144,7 +159,7 @@
Set the target path to store the COMMUNITY from SEMANTIC RELATION graph
communitySemRelGraphPath
- /tmp/beta_provision/graph/10_graph_comunity_sem_rel
+ /tmp/beta_provision/graph/11_graph_comunity_sem_rel
@@ -155,7 +170,7 @@
Set the target path to store the COUNTRY enriched graph
countryGraphPath
- /tmp/beta_provision/graph/11_graph_country
+ /tmp/beta_provision/graph/12_graph_country
@@ -166,7 +181,7 @@
Set the target path to store the CLEANED graph
cleanedGraphPath
- /tmp/beta_provision/graph/12_graph_cleaned
+ /tmp/beta_provision/graph/13_graph_cleaned
@@ -177,7 +192,7 @@
Set the target path to store the blacklisted graph
blacklistedGraphPath
- /tmp/beta_provision/graph/13_graph_blacklisted
+ /tmp/beta_provision/graph/14_graph_blacklisted
@@ -324,6 +339,31 @@
build-report
+
+
+
+
+
+
+ clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid
+
+ executeOozieJob
+ IIS
+
+ {
+ 'graphInputPath' : 'rawGraphPath',
+ 'graphOutputPath': 'firstCleanedGraphPath',
+ 'isLookupUrl': 'isLookUpUrl'
+ }
+
+
+ {
+ 'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app',
+ 'workingPath' : '/tmp/beta_provision/working_dir/first_clean'
+ }
+
+ build-report
+
@@ -337,7 +377,7 @@
{
'actionSetId' : 'dedupConfig',
- 'graphBasePath' : 'rawGraphPath',
+ 'graphBasePath' : 'firstCleanedGraphPath',
'dedupGraphPath': 'dedupGraphPath',
'isLookUpUrl' : 'isLookUpUrl'
}