forked from antonis.lempesis/dnet-hadoop
Merge remote-tracking branch 'upstream/master'
This commit is contained in:
commit
ab08a37024
Binary file not shown.
After Width: | Height: | Size: 689 KiB |
|
@ -30,6 +30,12 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
|
|
|
@ -1,66 +1,83 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.Assert;
|
||||
|
||||
public class Relation extends Oaf {
|
||||
|
||||
private String relType;
|
||||
private String relType;
|
||||
|
||||
private String subRelType;
|
||||
private String subRelType;
|
||||
|
||||
private String relClass;
|
||||
private String relClass;
|
||||
|
||||
private String source;
|
||||
private String source;
|
||||
|
||||
private String target;
|
||||
private String target;
|
||||
|
||||
private List<KeyValue> collectedFrom;
|
||||
private List<KeyValue> collectedFrom = new ArrayList<>();
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public void setRelType(String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
|
||||
public void setSubRelType(String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
}
|
||||
public void setSubRelType(final String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
}
|
||||
|
||||
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
|
||||
public void setRelClass(String relClass) {
|
||||
this.relClass = relClass;
|
||||
}
|
||||
public void setRelClass(final String relClass) {
|
||||
this.relClass = relClass;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(String source) {
|
||||
this.source = source;
|
||||
}
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(String target) {
|
||||
this.target = target;
|
||||
}
|
||||
public void setTarget(final String target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public List<KeyValue> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
public List<KeyValue> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
|
||||
public void setCollectedFrom(final List<KeyValue> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
|
||||
public void mergeFrom(final Relation r) {
|
||||
Assert.assertEquals("source ids must be equal", getSource(), r.getSource());
|
||||
Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget());
|
||||
Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType());
|
||||
Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType());
|
||||
Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass());
|
||||
setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ import java.io.Serializable;
|
|||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class Result extends OafEntity implements Serializable {
|
||||
public class Result extends OafEntity implements Serializable {
|
||||
|
||||
private List<Author> author;
|
||||
|
||||
|
|
|
@ -24,6 +24,12 @@
|
|||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-core</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -32,6 +38,49 @@
|
|||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon-dom</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>jgrapht</groupId>
|
||||
<artifactId>jgrapht</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.sf.ehcache</groupId>
|
||||
<artifactId>ehcache</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-test</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>apache</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-data-protos</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
|
@ -55,6 +104,11 @@
|
|||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ExtractEntitiesFromHDFSJob {
|
||||
|
||||
|
||||
private static List<String> folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities");
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
final String entity = parser.get("entity");
|
||||
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
|
||||
JavaRDD<String> inputRdd = sc.emptyRDD();
|
||||
|
||||
|
||||
folderNames.forEach(p -> inputRdd.union(
|
||||
sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.filter(k -> isEntityType(k._1(), entity))
|
||||
.map(Tuple2::_2))
|
||||
);
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath+"/"+entity);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isEntityType(final String item, final String entity) {
|
||||
return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
|
||||
}
|
||||
}
|
|
@ -1,45 +0,0 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class MigrateMongoMdstoresApplication {
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||
final String mongoDb = parser.get("mongoDb");
|
||||
|
||||
final String mdFormat = parser.get("mdFormat");
|
||||
final String mdLayout = parser.get("mdLayout");
|
||||
final String mdInterpretation = parser.get("mdInterpretation");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("namenode");
|
||||
final String hdfsUser = parser.get("hdfsUser");
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
if (mdFormat.equalsIgnoreCase("oaf")) {
|
||||
try (final OafMigrationExecutor mig =
|
||||
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else if (mdFormat.equalsIgnoreCase("odf")) {
|
||||
try (final OdfMigrationExecutor mig =
|
||||
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Format not supported: " + mdFormat);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class LicenseComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
if (left == null && right == null) return 0;
|
||||
if (left == null) return 1;
|
||||
if (right == null) return -1;
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass)) return 0;
|
||||
|
||||
if (lClass.equals("OPEN SOURCE")) return -1;
|
||||
if (rClass.equals("OPEN SOURCE")) return 1;
|
||||
|
||||
if (lClass.equals("OPEN")) return -1;
|
||||
if (rClass.equals("OPEN")) return 1;
|
||||
|
||||
if (lClass.equals("6MONTHS")) return -1;
|
||||
if (rClass.equals("6MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("12MONTHS")) return -1;
|
||||
if (rClass.equals("12MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("EMBARGO")) return -1;
|
||||
if (rClass.equals("EMBARGO")) return 1;
|
||||
|
||||
if (lClass.equals("RESTRICTED")) return -1;
|
||||
if (rClass.equals("RESTRICTED")) return 1;
|
||||
|
||||
if (lClass.equals("CLOSED")) return -1;
|
||||
if (rClass.equals("CLOSED")) return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN")) return -1;
|
||||
if (rClass.equals("UNKNOWN")) return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.tools.DistCp;
|
||||
import org.apache.hadoop.tools.DistCpOptions;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class MigrateActionSet {
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateActionSet.class);
|
||||
|
||||
private static final String SEPARATOR = "/";
|
||||
private static final String TARGET_PATHS = "target_paths";
|
||||
private static final String RAWSET_PREFIX = "rawset_";
|
||||
|
||||
private static Boolean DEFAULT_TRANSFORM_ONLY = false;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new MigrateActionSet().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws Exception {
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String sourceNN = parser.get("sourceNameNode");
|
||||
final String targetNN = parser.get("targetNameNode");
|
||||
final String workDir = parser.get("workingDirectory");
|
||||
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
|
||||
|
||||
final String distcp_memory_mb = parser.get("distcp_memory_mb");
|
||||
final String distcp_task_timeout = parser.get("distcp_task_timeout");
|
||||
|
||||
final String transform_only_s = parser.get("transform_only");
|
||||
|
||||
log.info("transform only param: " + transform_only_s);
|
||||
|
||||
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
|
||||
|
||||
log.info("transform only: " + transformOnly);
|
||||
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
FileSystem targetFS = FileSystem.get(conf);
|
||||
|
||||
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
|
||||
FileSystem sourceFS = FileSystem.get(sourceConf);
|
||||
|
||||
Properties props = new Properties();
|
||||
|
||||
List<Path> targetPaths = new ArrayList<>();
|
||||
|
||||
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
|
||||
log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))));
|
||||
for(Path source : sourcePaths) {
|
||||
|
||||
if (!sourceFS.exists(source)) {
|
||||
log.warn(String.format("skipping unexisting path: %s", source));
|
||||
} else {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
|
||||
|
||||
final String rawSet = pathQ.pollLast();
|
||||
log.info(String.format("got RAWSET: %s", rawSet));
|
||||
|
||||
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
|
||||
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
|
||||
|
||||
log.info(String.format("using TARGET PATH: %s", targetPath));
|
||||
|
||||
if (!transformOnly) {
|
||||
if (targetFS.exists(targetPath)) {
|
||||
targetFS.delete(targetPath, true);
|
||||
}
|
||||
runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
|
||||
}
|
||||
|
||||
targetPaths.add(targetPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
props.setProperty(TARGET_PATHS, targetPaths
|
||||
.stream()
|
||||
.map(p -> p.toString())
|
||||
.collect(Collectors.joining(",")));
|
||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||
|
||||
try(OutputStream os = new FileOutputStream(file)) {
|
||||
props.store(os, "");
|
||||
}
|
||||
System.out.println(file.getAbsolutePath());
|
||||
}
|
||||
|
||||
private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
|
||||
|
||||
final DistCpOptions op = new DistCpOptions(source, targetPath);
|
||||
op.setMaxMaps(distcp_num_maps);
|
||||
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
|
||||
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
|
||||
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
|
||||
|
||||
int res = ToolRunner.run(new DistCp(conf, op), new String[]{
|
||||
"-Dmapred.task.timeout=" + distcp_task_timeout,
|
||||
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
|
||||
"-pb",
|
||||
"-m " + distcp_num_maps,
|
||||
source.toString(),
|
||||
targetPath.toString()});
|
||||
|
||||
if (res != 0) {
|
||||
throw new RuntimeException(String.format("distcp exited with code %s", res));
|
||||
}
|
||||
}
|
||||
|
||||
private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.http.client.retry.policy.enabled", "true");
|
||||
conf.set("mapred.task.timeout", distcp_task_timeout);
|
||||
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
|
||||
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
|
||||
return conf;
|
||||
}
|
||||
|
||||
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
|
||||
String XQUERY = "distinct-values(\n" +
|
||||
"let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
|
||||
"for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
|
||||
"let $setDir := $x//SET/@directory/string()\n" +
|
||||
"let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
|
||||
"return concat($basePath, '/', $setDir, '/', $rawSet))";
|
||||
|
||||
log.info(String.format("running xquery:\n%s", XQUERY));
|
||||
return isLookUp.quickSearchProfile(XQUERY)
|
||||
.stream()
|
||||
.map(p -> sourceNN + p)
|
||||
.map(Path::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,580 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.googlecode.protobuf.format.JsonFormat;
|
||||
import eu.dnetlib.data.proto.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class ProtoConverter implements Serializable {
|
||||
|
||||
public static final String UNKNOWN = "UNKNOWN";
|
||||
public static final String NOT_AVAILABLE = "not available";
|
||||
public static final String DNET_ACCESS_MODES = "dnet:access_modes";
|
||||
|
||||
public static Oaf convert(OafProtos.Oaf oaf) {
|
||||
try {
|
||||
switch (oaf.getKind()) {
|
||||
case entity:
|
||||
return convertEntity(oaf);
|
||||
case relation:
|
||||
return convertRelation(oaf);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Relation convertRelation(OafProtos.Oaf oaf) {
|
||||
final OafProtos.OafRel r = oaf.getRel();
|
||||
final Relation rel = new Relation();
|
||||
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
|
||||
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
|
||||
rel.setSource(r.getSource());
|
||||
rel.setTarget(r.getTarget());
|
||||
rel.setRelType(r.getRelType().toString());
|
||||
rel.setSubRelType(r.getSubRelType().toString());
|
||||
rel.setRelClass(r.getRelClass());
|
||||
rel.setCollectedFrom(r.getCollectedfromCount() > 0 ?
|
||||
r.getCollectedfromList().stream()
|
||||
.map(kv -> mapKV(kv))
|
||||
.collect(Collectors.toList()) : null);
|
||||
return rel;
|
||||
}
|
||||
|
||||
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
|
||||
|
||||
switch (oaf.getEntity().getType()) {
|
||||
case result:
|
||||
final Result r = convertResult(oaf);
|
||||
r.setInstance(convertInstances(oaf));
|
||||
return r;
|
||||
case project:
|
||||
return convertProject(oaf);
|
||||
case datasource:
|
||||
return convertDataSource(oaf);
|
||||
case organization:
|
||||
return convertOrganization(oaf);
|
||||
default:
|
||||
throw new RuntimeException("received unknown type");
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
|
||||
|
||||
final ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getInstanceCount() > 0) {
|
||||
return r.getInstanceList()
|
||||
.stream()
|
||||
.map(i -> convertInstance(i))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
|
||||
final Instance i = new Instance();
|
||||
i.setAccessright(mapQualifier(ri.getAccessright()));
|
||||
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
|
||||
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
|
||||
i.setDistributionlocation(ri.getDistributionlocation());
|
||||
i.setHostedby(mapKV(ri.getHostedby()));
|
||||
i.setInstancetype(mapQualifier(ri.getInstancetype()));
|
||||
i.setLicense(mapStringField(ri.getLicense()));
|
||||
i.setUrl(ri.getUrlList());
|
||||
i.setRefereed(mapStringField(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Organization convertOrganization(OafProtos.Oaf oaf) {
|
||||
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
|
||||
final Organization org = setOaf(new Organization(), oaf);
|
||||
setEntity(org, oaf);
|
||||
org.setLegalshortname(mapStringField(m.getLegalshortname()));
|
||||
org.setLegalname(mapStringField(m.getLegalname()));
|
||||
org.setAlternativeNames(m.getAlternativeNamesList().
|
||||
stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
org.setLogourl(mapStringField(m.getLogourl()));
|
||||
org.setEclegalbody(mapStringField(m.getEclegalbody()));
|
||||
org.setEclegalperson(mapStringField(m.getEclegalperson()));
|
||||
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
|
||||
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
|
||||
org.setEchighereducation(mapStringField(m.getEchighereducation()));
|
||||
org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
|
||||
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
|
||||
org.setEcenterprise(mapStringField(m.getEcenterprise()));
|
||||
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
|
||||
org.setEcnutscode(mapStringField(m.getEcnutscode()));
|
||||
org.setCountry(mapQualifier(m.getCountry()));
|
||||
|
||||
return org;
|
||||
}
|
||||
|
||||
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
|
||||
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
|
||||
final Datasource datasource = setOaf(new Datasource(), oaf);
|
||||
setEntity(datasource, oaf);
|
||||
datasource.setAccessinfopackage(m.getAccessinfopackageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setCertificates(mapStringField(m.getCertificates()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setContactemail(mapStringField(m.getContactemail()));
|
||||
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
|
||||
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
|
||||
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
|
||||
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
|
||||
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
|
||||
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
|
||||
datasource.setDescription(mapStringField(m.getDescription()));
|
||||
datasource.setEnglishname(mapStringField(m.getEnglishname()));
|
||||
datasource.setLatitude(mapStringField(m.getLatitude()));
|
||||
datasource.setLongitude(mapStringField(m.getLongitude()));
|
||||
datasource.setLogourl(mapStringField(m.getLogourl()));
|
||||
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
|
||||
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
|
||||
datasource.setOdcontenttypes(m.getOdcontenttypesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setOdlanguages(m.getOdlanguagesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
|
||||
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
|
||||
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
|
||||
datasource.setOfficialname(mapStringField(m.getOfficialname()));
|
||||
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
|
||||
datasource.setPidsystems(mapStringField(m.getPidsystems()));
|
||||
datasource.setPolicies(m.getPoliciesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapKV)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
|
||||
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
|
||||
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
|
||||
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
|
||||
datasource.setSubjects(m.getSubjectsList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setVersioning(mapBoolField(m.getVersioning()));
|
||||
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
datasource.setJournal(mapJournal(m.getJournal()));
|
||||
|
||||
|
||||
return datasource;
|
||||
}
|
||||
|
||||
private static Project convertProject(OafProtos.Oaf oaf) {
|
||||
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
|
||||
final Project project = setOaf(new Project(), oaf);
|
||||
setEntity(project, oaf);
|
||||
project.setAcronym(mapStringField(m.getAcronym()));
|
||||
project.setCallidentifier(mapStringField(m.getCallidentifier()));
|
||||
project.setCode(mapStringField(m.getCode()));
|
||||
project.setContactemail(mapStringField(m.getContactemail()));
|
||||
project.setContactfax(mapStringField(m.getContactfax()));
|
||||
project.setContactfullname(mapStringField(m.getContactfullname()));
|
||||
project.setContactphone(mapStringField(m.getContactphone()));
|
||||
project.setContracttype(mapQualifier(m.getContracttype()));
|
||||
project.setCurrency(mapStringField(m.getCurrency()));
|
||||
project.setDuration(mapStringField(m.getDuration()));
|
||||
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
|
||||
project.setEcsc39(mapStringField(m.getEcsc39()));
|
||||
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
|
||||
project.setStartdate(mapStringField(m.getStartdate()));
|
||||
project.setEnddate(mapStringField(m.getEnddate()));
|
||||
project.setFundedamount(m.getFundedamount());
|
||||
project.setTotalcost(m.getTotalcost());
|
||||
project.setKeywords(mapStringField(m.getKeywords()));
|
||||
project.setSubjects(m.getSubjectsList().stream()
|
||||
.map(sp -> mapStructuredProperty(sp))
|
||||
.collect(Collectors.toList()));
|
||||
project.setTitle(mapStringField(m.getTitle()));
|
||||
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
project.setFundingtree(m.getFundingtreeList().stream()
|
||||
.map(f -> mapStringField(f))
|
||||
.collect(Collectors.toList()));
|
||||
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
|
||||
project.setSummary(mapStringField(m.getSummary()));
|
||||
project.setOptional1(mapStringField(m.getOptional1()));
|
||||
project.setOptional2(mapStringField(m.getOptional2()));
|
||||
return project;
|
||||
}
|
||||
|
||||
private static Result convertResult(OafProtos.Oaf oaf) {
|
||||
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
|
||||
case "dataset":
|
||||
return createDataset(oaf);
|
||||
case "publication":
|
||||
return createPublication(oaf);
|
||||
case "software":
|
||||
return createSoftware(oaf);
|
||||
case "other":
|
||||
return createORP(oaf);
|
||||
default:
|
||||
Result result = setOaf(new Result(), oaf);
|
||||
setEntity(result, oaf);
|
||||
return setResult(result, oaf);
|
||||
}
|
||||
}
|
||||
|
||||
private static Software createSoftware(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Software software = setOaf(new Software(), oaf);
|
||||
setEntity(software, oaf);
|
||||
setResult(software, oaf);
|
||||
|
||||
software.setDocumentationUrl(m.getDocumentationUrlList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
software.setLicense(m.getLicenseList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
|
||||
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
|
||||
return software;
|
||||
}
|
||||
|
||||
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
|
||||
setEntity(otherResearchProducts, oaf);
|
||||
setResult(otherResearchProducts, oaf);
|
||||
otherResearchProducts.setContactperson(m.getContactpersonList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts.setContactgroup(m.getContactgroupList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts.setTool(m.getToolList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return otherResearchProducts;
|
||||
}
|
||||
|
||||
private static Publication createPublication(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Publication publication = setOaf(new Publication(), oaf);
|
||||
setEntity(publication, oaf);
|
||||
setResult(publication, oaf);
|
||||
publication.setJournal(mapJournal(m.getJournal()));
|
||||
return publication;
|
||||
}
|
||||
|
||||
private static Dataset createDataset(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Dataset dataset = setOaf(new Dataset(), oaf);
|
||||
setEntity(dataset, oaf);
|
||||
setResult(dataset, oaf);
|
||||
dataset.setStoragedate(mapStringField(m.getStoragedate()));
|
||||
dataset.setDevice(mapStringField(m.getDevice()));
|
||||
dataset.setSize(mapStringField(m.getSize()));
|
||||
dataset.setVersion(mapStringField(m.getVersion()));
|
||||
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
|
||||
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
|
||||
dataset.setGeolocation(m.getGeolocationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapGeolocation)
|
||||
.collect(Collectors.toList()));
|
||||
return dataset;
|
||||
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
|
||||
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
|
||||
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
|
||||
return oaf;
|
||||
}
|
||||
|
||||
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
|
||||
//setting Entity fields
|
||||
final OafProtos.OafEntity e = oaf.getEntity();
|
||||
entity.setId(e.getId());
|
||||
entity.setOriginalId(e.getOriginalIdList());
|
||||
entity.setCollectedfrom(e.getCollectedfromList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapKV)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setPid(e.getPidList().stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofcollection(e.getDateofcollection());
|
||||
entity.setDateoftransformation(e.getDateoftransformation());
|
||||
entity.setExtraInfo(e.getExtraInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapExtraInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
|
||||
//setting Entity fields
|
||||
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
entity.setAuthor(m.getAuthorList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapAuthor)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setResulttype(mapQualifier(m.getResulttype()));
|
||||
entity.setLanguage(mapQualifier(m.getLanguage()));
|
||||
entity.setCountry(m.getCountryList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapQualifierAsCountry)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setSubject(m.getSubjectList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setTitle(m.getTitleList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setRelevantdate(m.getRelevantdateList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDescription(m.getDescriptionList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
|
||||
entity.setPublisher(mapStringField(m.getPublisher()));
|
||||
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
|
||||
entity.setSource(m.getSourceList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setFulltext(m.getFulltextList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setFormat(m.getFormatList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setContributor(m.getContributorList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setResourcetype(mapQualifier(m.getResourcetype()));
|
||||
entity.setCoverage(m.getCoverageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setContext(m.getContextList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapContext)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
|
||||
|
||||
return entity;
|
||||
}
|
||||
|
||||
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
|
||||
if (instanceList != null) {
|
||||
final Optional<FieldTypeProtos.Qualifier> min = instanceList.stream()
|
||||
.map(i -> i.getAccessright()).min(new LicenseComparator());
|
||||
|
||||
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
|
||||
|
||||
if (StringUtils.isBlank(rights.getClassid())) {
|
||||
rights.setClassid(UNKNOWN);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||
rights.setClassname(NOT_AVAILABLE);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||
rights.setSchemename(DNET_ACCESS_MODES);
|
||||
}
|
||||
|
||||
return rights;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static Context mapContext(ResultProtos.Result.Context context) {
|
||||
|
||||
final Context entity = new Context();
|
||||
entity.setId(context.getId());
|
||||
entity.setDataInfo(context.getDataInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapDataInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
|
||||
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
||||
final KeyValue keyValue = new KeyValue();
|
||||
keyValue.setKey(kv.getKey());
|
||||
keyValue.setValue(kv.getValue());
|
||||
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
|
||||
return keyValue;
|
||||
}
|
||||
|
||||
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
|
||||
final DataInfo dataInfo = new DataInfo();
|
||||
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
|
||||
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
|
||||
dataInfo.setInferred(d.getInferred());
|
||||
dataInfo.setInvisible(d.getInvisible());
|
||||
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
|
||||
dataInfo.setTrust(d.getTrust());
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
|
||||
final Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid(q.getClassid());
|
||||
qualifier.setClassname(q.getClassname());
|
||||
qualifier.setSchemeid(q.getSchemeid());
|
||||
qualifier.setSchemename(q.getSchemename());
|
||||
return qualifier;
|
||||
}
|
||||
|
||||
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
|
||||
final Country c = new Country();
|
||||
c.setClassid(q.getClassid());
|
||||
c.setClassname(q.getClassname());
|
||||
c.setSchemeid(q.getSchemeid());
|
||||
c.setSchemename(q.getSchemename());
|
||||
c.setDataInfo(mapDataInfo(q.getDataInfo()));
|
||||
return c;
|
||||
}
|
||||
|
||||
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
||||
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||
structuredProperty.setValue(sp.getValue());
|
||||
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
||||
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
|
||||
return structuredProperty;
|
||||
}
|
||||
|
||||
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
|
||||
final ExtraInfo entity = new ExtraInfo();
|
||||
entity.setName(extraInfo.getName());
|
||||
entity.setTypology(extraInfo.getTypology());
|
||||
entity.setProvenance(extraInfo.getProvenance());
|
||||
entity.setTrust(extraInfo.getTrust());
|
||||
entity.setValue(extraInfo.getValue());
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
|
||||
final OAIProvenance entity = new OAIProvenance();
|
||||
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
|
||||
final OriginDescription originDescriptionResult = new OriginDescription();
|
||||
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
|
||||
originDescriptionResult.setAltered(originDescription.getAltered());
|
||||
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
|
||||
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
|
||||
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
|
||||
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
|
||||
return originDescriptionResult;
|
||||
}
|
||||
|
||||
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
||||
final Field<String> stringField = new Field<>();
|
||||
stringField.setValue(s.getValue());
|
||||
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
||||
return stringField;
|
||||
}
|
||||
|
||||
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
||||
final Field<Boolean> booleanField = new Field<>();
|
||||
booleanField.setValue(b.getValue());
|
||||
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||
return booleanField;
|
||||
}
|
||||
|
||||
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
|
||||
final Field<Integer> entity = new Field<>();
|
||||
entity.setValue(b.getValue());
|
||||
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
||||
final Journal journal = new Journal();
|
||||
journal.setConferencedate(j.getConferencedate());
|
||||
journal.setConferenceplace(j.getConferenceplace());
|
||||
journal.setEdition(j.getEdition());
|
||||
journal.setEp(j.getEp());
|
||||
journal.setIss(j.getIss());
|
||||
journal.setIssnLinking(j.getIssnLinking());
|
||||
journal.setIssnOnline(j.getIssnOnline());
|
||||
journal.setIssnPrinted(j.getIssnPrinted());
|
||||
journal.setName(j.getName());
|
||||
journal.setSp(j.getSp());
|
||||
journal.setVol(j.getVol());
|
||||
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
|
||||
return journal;
|
||||
}
|
||||
|
||||
public static Author mapAuthor(FieldTypeProtos.Author author) {
|
||||
final Author entity = new Author();
|
||||
entity.setFullname(author.getFullname());
|
||||
entity.setName(author.getName());
|
||||
entity.setSurname(author.getSurname());
|
||||
entity.setRank(author.getRank());
|
||||
entity.setPid(author.getPidList()
|
||||
.stream()
|
||||
.map(kv -> {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(kv.getValue());
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(kv.getKey());
|
||||
q.setClassname(kv.getKey());
|
||||
sp.setQualifier(q);
|
||||
return sp;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
entity.setAffiliation(author.getAffiliationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
|
||||
}
|
||||
|
||||
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
|
||||
final GeoLocation entity = new GeoLocation();
|
||||
entity.setPoint(geoLocation.getPoint());
|
||||
entity.setBox(geoLocation.getBox());
|
||||
entity.setPlace(geoLocation.getPlace());
|
||||
return entity;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,159 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
||||
import eu.dnetlib.data.proto.OafProtos;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class TransformActions implements Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(TransformActions.class);
|
||||
private static final String SEPARATOR = "/";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new TransformActions().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: " + isLookupUrl);
|
||||
|
||||
final String inputPaths = parser.get("inputPaths");
|
||||
|
||||
if (StringUtils.isBlank(inputPaths)) {
|
||||
throw new RuntimeException("empty inputPaths");
|
||||
}
|
||||
log.info("inputPaths: " + inputPaths);
|
||||
|
||||
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
|
||||
|
||||
try(SparkSession spark = getSparkSession(parser)) {
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
|
||||
for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
|
||||
|
||||
final String rawset = pathQ.pollLast();
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
|
||||
|
||||
if (fs.exists(targetDirectory)) {
|
||||
log.info(String.format("found target directory '%s", targetDirectory));
|
||||
fs.delete(targetDirectory, true);
|
||||
log.info(String.format("deleted target directory '%s", targetDirectory));
|
||||
}
|
||||
|
||||
log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
|
||||
|
||||
sc.sequenceFile(sourcePath, Text.class, Text.class)
|
||||
.mapToPair(a -> new Tuple2<>(a._1(), AtomicAction.fromJSON(a._2().toString())))
|
||||
.mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
|
||||
|
||||
.saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Text transformAction(String atomicaActionId, AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
|
||||
Oaf oaf = ProtoConverter.convert(OafProtos.Oaf.parseFrom(aa.getTargetValue()));
|
||||
aa.setTargetValue(mapper.writeValueAsString(oaf).getBytes());
|
||||
} else {
|
||||
|
||||
if (atomicaActionId.contains("dedupSimilarity")) {
|
||||
|
||||
final String[] splitId = atomicaActionId.split("@");
|
||||
|
||||
String source = splitId[0];
|
||||
String target = splitId[2];
|
||||
|
||||
String[] relSemantic = splitId[1].split("_");
|
||||
|
||||
Relation rel = new Relation();
|
||||
rel.setSource(source);
|
||||
rel.setTarget(target);
|
||||
rel.setRelType(relSemantic[0]);
|
||||
rel.setSubRelType(relSemantic[1]);
|
||||
rel.setRelClass(relSemantic[2]);
|
||||
|
||||
DataInfo d = new DataInfo();
|
||||
d.setDeletedbyinference(false);
|
||||
d.setInferenceprovenance("deduplication");
|
||||
d.setInferred(true);
|
||||
d.setInvisible(false);
|
||||
Qualifier provenanceaction = new Qualifier();
|
||||
|
||||
provenanceaction.setClassid("deduplication");
|
||||
provenanceaction.setClassname("deduplication");
|
||||
provenanceaction.setSchemeid("dnet:provenanceActions");
|
||||
provenanceaction.setSchemename("dnet:provenanceActions");
|
||||
|
||||
d.setProvenanceaction(provenanceaction);
|
||||
|
||||
rel.setDataInfo(d);
|
||||
|
||||
aa.setTargetValue(mapper.writeValueAsString(rel).getBytes());
|
||||
}
|
||||
}
|
||||
|
||||
return new Text(mapper.writeValueAsString(aa));
|
||||
}
|
||||
|
||||
private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
|
||||
return isLookUp.getResourceProfileByQuery(XQUERY);
|
||||
}
|
||||
|
||||
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(TransformActions.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config(conf)
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
}
|
||||
}
|
|
@ -1,4 +1,14 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.step1;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -17,18 +27,26 @@ import org.apache.commons.logging.Log;
|
|||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
|
||||
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
|
||||
|
@ -50,32 +68,36 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("namenode");
|
||||
final String hdfsUser = parser.get("hdfsUser");
|
||||
|
||||
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) {
|
||||
log.info("Processing datasources...");
|
||||
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
|
||||
final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
|
||||
|
||||
log.info("Processing projects...");
|
||||
smdbe.execute("queryProjects.sql", smdbe::processProject);
|
||||
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) {
|
||||
if (processClaims) {
|
||||
log.info("Processing claims...");
|
||||
smdbe.execute("queryClaims.sql", smdbe::processClaims);
|
||||
} else {
|
||||
log.info("Processing datasources...");
|
||||
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
|
||||
|
||||
log.info("Processing orgs...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
|
||||
log.info("Processing projects...");
|
||||
smdbe.execute("queryProjects.sql", smdbe::processProject);
|
||||
|
||||
log.info("Processing relations ds <-> orgs ...");
|
||||
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
|
||||
log.info("Processing orgs...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
|
||||
|
||||
log.info("Processing projects <-> orgs ...");
|
||||
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
|
||||
log.info("Processing relations ds <-> orgs ...");
|
||||
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
|
||||
|
||||
log.info("Processing projects <-> orgs ...");
|
||||
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
|
||||
}
|
||||
log.info("All done.");
|
||||
}
|
||||
}
|
||||
|
||||
public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser,
|
||||
public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser);
|
||||
super(hdfsPath);
|
||||
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
|
||||
this.lastUpdateTimestamp = new Date().getTime();
|
||||
}
|
||||
|
@ -93,7 +115,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
|
||||
final Datasource ds = new Datasource();
|
||||
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid")));
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
||||
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
|
||||
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
ds.setPid(new ArrayList<>());
|
||||
|
@ -200,7 +222,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
|
||||
final Project p = new Project();
|
||||
|
||||
p.setId(createOpenaireId(40, rs.getString("projectid")));
|
||||
p.setId(createOpenaireId(40, rs.getString("projectid"), true));
|
||||
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
|
||||
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
p.setPid(new ArrayList<>());
|
||||
|
@ -290,7 +312,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
|
||||
final Organization o = new Organization();
|
||||
|
||||
o.setId(createOpenaireId(20, rs.getString("organizationid")));
|
||||
o.setId(createOpenaireId(20, rs.getString("organizationid"), true));
|
||||
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
|
||||
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
o.setPid(new ArrayList<>());
|
||||
|
@ -354,8 +376,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("organization"));
|
||||
final String dsId = createOpenaireId(10, rs.getString("datasource"));
|
||||
final String orgId = createOpenaireId(20, rs.getString("organization"), true);
|
||||
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
|
@ -377,7 +399,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
r2.setTarget(dsId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r2);
|
||||
|
||||
// rs.getString("datasource");
|
||||
|
@ -403,8 +425,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("resporganization"));
|
||||
final String projectId = createOpenaireId(40, rs.getString("project"));
|
||||
final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
|
||||
final String projectId = createOpenaireId(40, rs.getString("project"), true);
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
|
@ -426,7 +448,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
r2.setTarget(projectId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r2);
|
||||
|
||||
// rs.getString("project");
|
||||
|
@ -450,6 +472,81 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
|
|||
}
|
||||
}
|
||||
|
||||
public void processClaims(final ResultSet rs) {
|
||||
|
||||
final DataInfo info =
|
||||
dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
|
||||
|
||||
try {
|
||||
|
||||
if (rs.getString("source_type").equals("context")) {
|
||||
final Result r;
|
||||
|
||||
if (rs.getString("target_type").equals("dataset")) {
|
||||
r = new Dataset();
|
||||
} else if (rs.getString("target_type").equals("software")) {
|
||||
r = new Software();
|
||||
} else if (rs.getString("target_type").equals("other")) {
|
||||
r = new OtherResearchProduct();
|
||||
} else {
|
||||
r = new Publication();
|
||||
}
|
||||
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r.setContext(prepareContext(rs.getString("source_id"), info));
|
||||
r.setDataInfo(info);
|
||||
emitOaf(r);
|
||||
} else {
|
||||
final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
|
||||
final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
final Relation r2 = new Relation();
|
||||
|
||||
if (rs.getString("source_type").equals("project")) {
|
||||
r1.setRelType("resultProject");
|
||||
r1.setSubRelType("outcome");
|
||||
r1.setRelClass("produces");
|
||||
|
||||
r2.setRelType("resultProject");
|
||||
r2.setSubRelType("outcome");
|
||||
r2.setRelClass("isProducedBy");
|
||||
} else {
|
||||
r1.setRelType("resultResult");
|
||||
r1.setSubRelType("relationship");
|
||||
r1.setRelClass("isRelatedTo");
|
||||
|
||||
r2.setRelType("resultResult");
|
||||
r2.setSubRelType("relationship");
|
||||
r2.setRelClass("isRelatedTo");
|
||||
}
|
||||
|
||||
r1.setSource(sourceId);
|
||||
r1.setTarget(targetId);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r1);
|
||||
|
||||
r2.setSource(targetId);
|
||||
r2.setTarget(sourceId);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r2);
|
||||
|
||||
}
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
|
||||
final Context context = new Context();
|
||||
context.setId(id);
|
||||
context.setDataInfo(Arrays.asList(dataInfo));
|
||||
return Arrays.asList(context);
|
||||
}
|
||||
|
||||
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
|
||||
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
|
||||
final String inferenceprovenance = rs.getString("inferenceprovenance");
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.MdstoreClient;
|
||||
|
||||
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
|
||||
|
||||
private final MdstoreClient mdstoreClient;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||
final String mongoDb = parser.get("mongoDb");
|
||||
|
||||
final String mdFormat = parser.get("mdFormat");
|
||||
final String mdLayout = parser.get("mdLayout");
|
||||
final String mdInterpretation = parser.get("mdInterpretation");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
|
||||
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
|
||||
app.execute(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
|
||||
super(hdfsPath);
|
||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||
}
|
||||
|
||||
public void execute(final String format, final String layout, final String interpretation) {
|
||||
final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
|
||||
log.info("Found " + colls.size() + " mdstores");
|
||||
|
||||
for (final Entry<String, String> entry : colls.entrySet()) {
|
||||
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
|
||||
final String currentColl = entry.getValue();
|
||||
|
||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||
emit(xml, "native_" + format);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
mdstoreClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,20 +1,24 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentFactory;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Node;
|
||||
|
@ -37,11 +41,9 @@ import eu.dnetlib.dhp.schema.oaf.Result;
|
|||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||
public abstract class AbstractMdRecordToOafMapper {
|
||||
|
||||
protected final Map<String, String> code2name = new HashMap<>();
|
||||
|
||||
protected final MdstoreClient mdstoreClient;
|
||||
protected final Map<String, String> code2name;
|
||||
|
||||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
|
||||
|
@ -51,79 +53,36 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class);
|
||||
|
||||
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
|
||||
final String mongoDb, final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser);
|
||||
|
||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||
loadClassNames(dbUrl, dbUser, dbPassword);
|
||||
|
||||
final Map<String, String> nsContext = new HashMap<>();
|
||||
|
||||
registerNamespaces(nsContext);
|
||||
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
|
||||
this.code2name = code2name;
|
||||
}
|
||||
|
||||
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
try {
|
||||
final Map<String, String> nsContext = new HashMap<>();
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
|
||||
log.info("Loading vocabulary terms from db...");
|
||||
final Document doc = DocumentHelper.parseText(xml);
|
||||
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
code2name.clear();
|
||||
dbClient.processResults("select code, name from class", rs -> {
|
||||
try {
|
||||
code2name.put(rs.getString("code"), rs.getString("name"));
|
||||
} catch (final SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
|
||||
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
|
||||
|
||||
final DataInfo info = prepareDataInfo(doc);
|
||||
final long lastUpdateTimestamp = new Date().getTime();
|
||||
|
||||
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
log.info("Found " + code2name.size() + " terms.");
|
||||
|
||||
}
|
||||
|
||||
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
|
||||
|
||||
log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation));
|
||||
|
||||
final Map<String, String> colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation);
|
||||
log.info("Found " + colls.size() + " mdstores");
|
||||
|
||||
for (final Entry<String, String> entry : colls.entrySet()) {
|
||||
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
|
||||
final String currentColl = entry.getValue();
|
||||
|
||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||
final Document doc = DocumentHelper.parseText(xml);
|
||||
|
||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
|
||||
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
|
||||
|
||||
final DataInfo info = prepareDataInfo(doc);
|
||||
final long lastUpdateTimestamp = new Date().getTime();
|
||||
|
||||
for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) {
|
||||
emitOaf(oaf);
|
||||
}
|
||||
}
|
||||
}
|
||||
log.info("All Done.");
|
||||
}
|
||||
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
}
|
||||
|
||||
protected List<Oaf> createOafs(final Document doc,
|
||||
|
@ -194,10 +153,10 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
||||
final String projectId = createOpenaireId(40, ((Node) o).getText());
|
||||
final String projectId = createOpenaireId(40, ((Node) o).getText(), true);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultProject");
|
||||
|
@ -238,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
final long lastUpdateTimestamp) {
|
||||
r.setDataInfo(info);
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
|
||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
||||
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
||||
|
@ -398,6 +357,8 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
protected DataInfo prepareDataInfo(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||
|
||||
if (n == null) { return null; }
|
||||
|
||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
|
||||
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
|
||||
|
@ -430,10 +391,4 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
mdstoreClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,172 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String sourcePaths = parser.get("sourcePaths");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
|
||||
generateEntities(sc, code2name, existingSourcePaths, targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(GenerateEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void generateEntities(final JavaSparkContext sc,
|
||||
final Map<String, String> code2name,
|
||||
final List<String> sourcePaths,
|
||||
final String targetPath) {
|
||||
|
||||
log.info("Generate entities from files:");
|
||||
sourcePaths.forEach(log::info);
|
||||
|
||||
JavaRDD<String> inputRdd = sc.emptyRDD();
|
||||
|
||||
for (final String sp : sourcePaths) {
|
||||
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
|
||||
.flatMap(list -> list.iterator())
|
||||
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
|
||||
}
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath);
|
||||
|
||||
}
|
||||
|
||||
private static List<Oaf> convertToListOaf(final String id, final String s, final Map<String, String> code2name) {
|
||||
final String type = StringUtils.substringAfter(id, ":");
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "native_oaf":
|
||||
return new OafToOafMapper(code2name).processMdRecord(s);
|
||||
case "native_odf":
|
||||
return new OdfToOafMapper(code2name).processMdRecord(s);
|
||||
case "datasource":
|
||||
return Arrays.asList(convertFromJson(s, Datasource.class));
|
||||
case "organization":
|
||||
return Arrays.asList(convertFromJson(s, Organization.class));
|
||||
case "project":
|
||||
return Arrays.asList(convertFromJson(s, Project.class));
|
||||
case "relation":
|
||||
return Arrays.asList(convertFromJson(s, Relation.class));
|
||||
case "publication":
|
||||
return Arrays.asList(convertFromJson(s, Publication.class));
|
||||
case "dataset":
|
||||
return Arrays.asList(convertFromJson(s, Dataset.class));
|
||||
case "software":
|
||||
return Arrays.asList(convertFromJson(s, Software.class));
|
||||
case "otherresearchproducts":
|
||||
default:
|
||||
return Arrays.asList(convertFromJson(s, OtherResearchProduct.class));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static Map<String, String> loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
||||
|
||||
log.info("Loading vocabulary terms from db...");
|
||||
|
||||
final Map<String, String> map = new HashMap<>();
|
||||
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
dbClient.processResults("select code, name from class", rs -> {
|
||||
try {
|
||||
map.put(rs.getString("code"), rs.getString("name"));
|
||||
} catch (final SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
log.info("Found " + map.size() + " terms.");
|
||||
|
||||
return map;
|
||||
|
||||
}
|
||||
|
||||
private static String convertToJson(final Oaf oaf) {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(oaf);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
|
||||
try {
|
||||
return new ObjectMapper().readValue(s, clazz);
|
||||
} catch (final Exception e) {
|
||||
log.error("Error parsing object of class: " + clazz);
|
||||
log.error(s);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
|
||||
try {
|
||||
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
|
||||
final Path path = new Path(pathToFile);
|
||||
return hdfs.exists(path);
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,16 +1,17 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.migration.pace.PacePerson;
|
||||
import eu.dnetlib.dhp.migration.utils.PacePerson;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
|
@ -22,20 +23,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OafMigrationExecutor.class);
|
||||
|
||||
public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
|
||||
final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
super.registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
public OafToOafMapper(final Map<String, String> code2name) {
|
||||
super(code2name);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -211,12 +202,12 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
|||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultResult");
|
|
@ -1,4 +1,8 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -6,8 +10,6 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
|
@ -22,38 +24,28 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class);
|
||||
|
||||
public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
|
||||
final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
super.registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
|
||||
public OdfToOafMapper(final Map<String, String> code2name) {
|
||||
super(code2name);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||
return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||
for (final Object o : doc.selectNodes("//datacite:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.valueOf("./dc:creatorName"));
|
||||
author.setName(n.valueOf("./dc:givenName"));
|
||||
author.setSurname(n.valueOf("./dc:familyName"));
|
||||
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info));
|
||||
author.setFullname(n.valueOf("./datacite:creatorName"));
|
||||
author.setName(n.valueOf("./datacite:givenName"));
|
||||
author.setSurname(n.valueOf("./datacite:familyName"));
|
||||
author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info));
|
||||
author.setPid(preparePids(doc, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
|
@ -63,7 +55,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) {
|
||||
for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
|
||||
}
|
||||
return res;
|
||||
|
@ -72,7 +64,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
|
@ -98,7 +90,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:date")) {
|
||||
for (final Object o : doc.selectNodes("//datacite:date")) {
|
||||
final String dateType = ((Node) o).valueOf("@dateType");
|
||||
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
|
||||
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
|
||||
|
@ -115,32 +107,32 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributorName", info);
|
||||
return prepareListFields(doc, "//datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:format", info);
|
||||
return prepareListFields(doc, "//datacite:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:publisher", info);
|
||||
return prepareField(doc, "//datacite:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info);
|
||||
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
return prepareListStructProps(doc, "//datacite:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||
return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -150,17 +142,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info);
|
||||
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info);
|
||||
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -175,7 +167,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||
return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
|
@ -184,11 +176,11 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
final List<GeoLocation> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//dc:geoLocation")) {
|
||||
for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
|
||||
final GeoLocation loc = new GeoLocation();
|
||||
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox"));
|
||||
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace"));
|
||||
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint"));
|
||||
loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox"));
|
||||
loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace"));
|
||||
loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint"));
|
||||
res.add(loc);
|
||||
}
|
||||
return res;
|
||||
|
@ -201,17 +193,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:date[@dateType='Updated']", info);
|
||||
return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:version", info);
|
||||
return prepareField(doc, "//datacite:version", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:size", info);
|
||||
return prepareField(doc, "//datacite:size", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -221,18 +213,18 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:date[@dateType='Issued']", info);
|
||||
return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||
for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
|
||||
final String type = ((Node) o).valueOf("@relationType");
|
||||
|
||||
if (type.equals("IsSupplementTo")) {
|
|
@ -0,0 +1,70 @@
|
|||
package eu.dnetlib.dhp.migration.step3;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class DispatchEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
|
||||
processEntity(sc, Publication.class, sourcePath, targetPath);
|
||||
processEntity(sc, Dataset.class, sourcePath, targetPath);
|
||||
processEntity(sc, Software.class, sourcePath, targetPath);
|
||||
processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
|
||||
processEntity(sc, Datasource.class, sourcePath, targetPath);
|
||||
processEntity(sc, Organization.class, sourcePath, targetPath);
|
||||
processEntity(sc, Project.class, sourcePath, targetPath);
|
||||
processEntity(sc, Relation.class, sourcePath, targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(DispatchEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
|
||||
final String type = clazz.getSimpleName().toLowerCase();
|
||||
|
||||
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
|
||||
|
||||
sc.textFile(sourcePath)
|
||||
.filter(l -> isEntityType(l, type))
|
||||
.map(l -> StringUtils.substringAfter(l, "|"))
|
||||
.saveAsTextFile(targetPath + "/" + type); // use repartition(XXX) ???
|
||||
}
|
||||
|
||||
private static boolean isEntityType(final String line, final String type) {
|
||||
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
||||
public class AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private final AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
private final Text key = new Text();
|
||||
|
||||
private final Text value = new Text();
|
||||
|
||||
private final SequenceFile.Writer writer;
|
||||
|
||||
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
|
||||
|
||||
public AbstractMigrationApplication(final String hdfsPath) throws Exception {
|
||||
|
||||
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));
|
||||
|
||||
this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
||||
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
private Configuration getConf() throws IOException {
|
||||
final Configuration conf = new Configuration();
|
||||
/*
|
||||
* conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
* conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser);
|
||||
* System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf);
|
||||
*/
|
||||
return conf;
|
||||
}
|
||||
|
||||
protected void emit(final String s, final String type) {
|
||||
try {
|
||||
key.set(counter.getAndIncrement() + ":" + type);
|
||||
value.set(s);
|
||||
writer.append(key, value);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected void emitOaf(final Oaf oaf) {
|
||||
try {
|
||||
emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase());
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public ObjectMapper getObjectMapper() {
|
||||
return objectMapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.hflush();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -28,8 +28,8 @@ public class DbClient implements Closeable {
|
|||
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
|
||||
this.connection.setAutoCommit(false);
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getClass().getName() + ": " + e.getMessage());
|
||||
throw new RuntimeException(e);
|
||||
log.error("Connection to postgresDB failed");
|
||||
throw new RuntimeException("Connection to postgresDB failed", e);
|
||||
}
|
||||
log.info("Opened database successfully");
|
||||
}
|
||||
|
@ -44,10 +44,12 @@ public class DbClient implements Closeable {
|
|||
consumer.accept(rs);
|
||||
}
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
log.error("Error executing sql query: " + sql, e);
|
||||
throw new RuntimeException("Error executing sql query", e);
|
||||
}
|
||||
} catch (final SQLException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
log.error("Error preparing sql statement", e1);
|
||||
throw new RuntimeException("Error preparing sql statement", e1);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
|
@ -1,24 +1,12 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
||||
|
@ -26,60 +14,12 @@ import eu.dnetlib.dhp.schema.oaf.Field;
|
|||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class AbstractMigrationExecutor implements Closeable {
|
||||
|
||||
private final AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
private final Text key = new Text();
|
||||
|
||||
private final Text value = new Text();
|
||||
|
||||
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private final SequenceFile.Writer writer;
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class);
|
||||
|
||||
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
|
||||
|
||||
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser));
|
||||
|
||||
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
||||
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
System.setProperty("HADOOP_USER_NAME", hdfsUser);
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
FileSystem.get(URI.create(hdfsNameNode), conf);
|
||||
return conf;
|
||||
}
|
||||
|
||||
protected void emitOaf(final Oaf oaf) {
|
||||
try {
|
||||
key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase());
|
||||
value.set(objectMapper.writeValueAsString(oaf));
|
||||
writer.append(key, value);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.hflush();
|
||||
writer.close();
|
||||
}
|
||||
public class OafMapperUtils {
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
|
@ -223,14 +163,33 @@ public class AbstractMigrationExecutor implements Closeable {
|
|||
return d;
|
||||
}
|
||||
|
||||
public static String createOpenaireId(final int prefix, final String originalId) {
|
||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
||||
public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) {
|
||||
if (to_md5) {
|
||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
||||
} else {
|
||||
return String.format("%s|%s", prefix, originalId);
|
||||
}
|
||||
}
|
||||
|
||||
public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) {
|
||||
switch (type) {
|
||||
case "datasource":
|
||||
return createOpenaireId(10, originalId, to_md5);
|
||||
case "organization":
|
||||
return createOpenaireId(20, originalId, to_md5);
|
||||
case "person":
|
||||
return createOpenaireId(30, originalId, to_md5);
|
||||
case "project":
|
||||
return createOpenaireId(40, originalId, to_md5);
|
||||
default:
|
||||
return createOpenaireId(50, originalId, to_md5);
|
||||
}
|
||||
}
|
||||
|
||||
public static String asString(final Object o) {
|
||||
return o == null ? "" : o.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dhp.migration.pace;
|
||||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
|
@ -2,7 +2,7 @@
|
|||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the HDFS source path which contains the sequential file",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -16,11 +16,5 @@
|
|||
"paramLongName": "graphRawPath",
|
||||
"paramDescription": "the path of the graph Raw in hdfs",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "e",
|
||||
"paramLongName": "entity",
|
||||
"paramDescription": "The entity to extract",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,39 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePaths",
|
||||
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the path of the target file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -0,0 +1,10 @@
|
|||
[
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
|
||||
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "nameNode of the target cluster", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
|
||||
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
|
||||
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate transform-only mode. Only apply transformation step", "paramRequired": true}
|
||||
]
|
|
@ -6,33 +6,27 @@
|
|||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "hdfsUser",
|
||||
"paramDescription": "the user wich create the hdfs seq file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dburl",
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbuser",
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "dbpasswd",
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "a",
|
||||
"paramLongName": "action",
|
||||
"paramDescription": "process claims",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -5,18 +5,6 @@
|
|||
"paramDescription": "the path where storing the sequential file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "hdfsUser",
|
||||
"paramDescription": "the user wich create the hdfs seq file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mongourl",
|
||||
"paramLongName": "mongoBaseUrl",
|
||||
|
@ -24,7 +12,7 @@
|
|||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "db",
|
||||
"paramName": "mongodb",
|
||||
"paramLongName": "mongoDb",
|
||||
"paramDescription": "mongo database",
|
||||
"paramRequired": true
|
||||
|
@ -46,23 +34,5 @@
|
|||
"paramLongName": "mdInterpretation",
|
||||
"paramDescription": "metadata interpretation",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -1,282 +0,0 @@
|
|||
<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphRawPath</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongourl</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ImportODFEntitiesFromMongoDB"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}'/>
|
||||
<mkdir path='${workingPath}'/>
|
||||
</fs>
|
||||
<ok to="ImportEntitiesFromPostgres"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportEntitiesFromPostgres">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/db_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-dburl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-dbuser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODFEntitiesFromMongoDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODFEntitiesFromMongoDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/odf_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-mongourl</arg><arg>${mongourl}</arg>
|
||||
<arg>-db</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportOAFEntitiesFromMongoDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAFEntitiesFromMongoDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/oaf_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-mongourl</arg><arg>${mongourl}</arg>
|
||||
<arg>-db</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ExtractPublication"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractPublication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: publication</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/publication</arg>
|
||||
<arg>-e</arg><arg>publication</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: dataset</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/dataset</arg>
|
||||
<arg>-e</arg><arg>dataset</arg>
|
||||
</spark>
|
||||
<ok to="ExtractSoftware"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractSoftware">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: software</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/software</arg>
|
||||
<arg>-e</arg><arg>software</arg>
|
||||
</spark>
|
||||
<ok to="ExtractORP"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractORP">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: otherresearchproduct</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/otherresearchproduct</arg>
|
||||
<arg>-e</arg><arg>otherresearchproduct</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDatasource"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractDatasource">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: datasource</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/datasource</arg>
|
||||
<arg>-e</arg><arg>datasource</arg>
|
||||
</spark>
|
||||
<ok to="ExtractOrganization"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractOrganization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: organization</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/organization</arg>
|
||||
<arg>-e</arg><arg>organization</arg>
|
||||
</spark>
|
||||
<ok to="ExtractProject"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractProject">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: project</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/project</arg>
|
||||
<arg>-e</arg><arg>project</arg>
|
||||
</spark>
|
||||
<ok to="ExtractRelation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractRelation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: relation</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/relation</arg>
|
||||
<arg>-e</arg><arg>relation</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1 @@
|
|||
SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;
|
|
@ -0,0 +1,90 @@
|
|||
SELECT
|
||||
p.id AS projectid,
|
||||
p.code AS code,
|
||||
p.websiteurl AS websiteurl,
|
||||
p.acronym AS acronym,
|
||||
p.title AS title,
|
||||
p.startdate AS startdate,
|
||||
p.enddate AS enddate,
|
||||
p.call_identifier AS callidentifier,
|
||||
p.keywords AS keywords,
|
||||
p.duration AS duration,
|
||||
p.ec_sc39 AS ecsc39,
|
||||
p.oa_mandate_for_publications AS oamandatepublications,
|
||||
p.ec_article29_3 AS ecarticle29_3,
|
||||
p.dateofcollection AS dateofcollection,
|
||||
p.lastupdate AS dateoftransformation,
|
||||
p.inferred AS inferred,
|
||||
p.deletedbyinference AS deletedbyinference,
|
||||
p.trust AS trust,
|
||||
p.inferenceprovenance AS inferenceprovenance,
|
||||
p.optional1 AS optional1,
|
||||
p.optional2 AS optional2,
|
||||
p.jsonextrainfo AS jsonextrainfo,
|
||||
p.contactfullname AS contactfullname,
|
||||
p.contactfax AS contactfax,
|
||||
p.contactphone AS contactphone,
|
||||
p.contactemail AS contactemail,
|
||||
p.summary AS summary,
|
||||
p.currency AS currency,
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
|
||||
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
FROM projects p
|
||||
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
|
||||
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
|
||||
|
||||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
||||
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
|
||||
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
|
||||
|
||||
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
|
||||
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
|
||||
|
||||
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
|
||||
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
|
||||
|
||||
GROUP BY
|
||||
p.id,
|
||||
p.code,
|
||||
p.websiteurl,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.call_identifier,
|
||||
p.keywords,
|
||||
p.duration,
|
||||
p.ec_sc39,
|
||||
p.oa_mandate_for_publications,
|
||||
p.ec_article29_3,
|
||||
p.dateofcollection,
|
||||
p.inferred,
|
||||
p.deletedbyinference,
|
||||
p.trust,
|
||||
p.inferenceprovenance,
|
||||
p.contactfullname,
|
||||
p.contactfax,
|
||||
p.contactphone,
|
||||
p.contactemail,
|
||||
p.summary,
|
||||
p.currency,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
pac.code, pac.name, pas.code, pas.name,
|
||||
ctc.code, ctc.name, cts.code, cts.name;
|
|
@ -0,0 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "input paths of the action sets to transform", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/applicationHistory</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,111 @@
|
|||
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<description>the source name node</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the isLookup service endpoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingDirectory</name>
|
||||
<value>/tmp/actionsets</value>
|
||||
<description>working directory</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_memory_mb</name>
|
||||
<value>6144</value>
|
||||
<description>memory for distcp copying actionsets from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_task_timeout</name>
|
||||
<value>60000000</value>
|
||||
<description>timeout for distcp copying actions from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_num_maps</name>
|
||||
<value>1</value>
|
||||
<description>maximum number of map tasks used in the distcp process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>transform_only</name>
|
||||
<description>activate transform-only mode: only apply the transformation step</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to='migrate_actionsets' />
|
||||
|
||||
<action name='migrate_actionsets'>
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.actions.MigrateActionSet</main-class>
|
||||
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
|
||||
<arg>-is</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>-sn</arg><arg>${sourceNN}</arg>
|
||||
<arg>-tn</arg><arg>${nameNode}</arg>
|
||||
<arg>-w</arg><arg>${workingDirectory}</arg>
|
||||
<arg>-nm</arg><arg>${distcp_num_maps}</arg>
|
||||
<arg>-mm</arg><arg>${distcp_memory_mb}</arg>
|
||||
<arg>-tt</arg><arg>${distcp_task_timeout}</arg>
|
||||
<arg>-tr</arg><arg>${transform_only}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="transform_actions" />
|
||||
<error to="fail" />
|
||||
</action>
|
||||
|
||||
<action name="transform_actions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>transform_actions</name>
|
||||
<class>eu.dnetlib.dhp.migration.actions.TransformActions</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn</arg>
|
||||
<arg>-is</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
|
||||
</spark>
|
||||
<ok to="end"/>
|
||||
<error to="fail"/>
|
||||
</action>
|
||||
|
||||
<kill name="fail">
|
||||
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<end name="end" />
|
||||
|
||||
</workflow-app>
|
|
@ -15,8 +15,4 @@
|
|||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hdfsUser</name>
|
||||
<value>dnet</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,169 @@
|
|||
<workflow-app name="import Claims as Graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep3</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL used to access the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the postgres user</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the postgres password</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep1}'/>
|
||||
<mkdir path='${migrationClaimsPathStep1}'/>
|
||||
</fs>
|
||||
<ok to="ImportDBClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDBClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/db_claims</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
<arg>-a</arg><arg>claims</arg>
|
||||
</java>
|
||||
<ok to="ImportODFClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODFClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/odf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="ImportOAFClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAFClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/oaf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="ResetClaimEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetClaimEntities">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep2}'/>
|
||||
<mkdir path='${migrationClaimsPathStep2}'/>
|
||||
</fs>
|
||||
<ok to="GenerateClaimEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateClaimEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateClaimEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims</arg>
|
||||
<arg>-t</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="ResetClaimGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetClaimGraph">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep3}'/>
|
||||
<mkdir path='${migrationClaimsPathStep3}'/>
|
||||
</fs>
|
||||
<ok to="GenerateClaimGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateClaimGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateClaimGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
|
||||
<arg>-g</arg><arg>${migrationClaimsPathStep3}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,168 @@
|
|||
<workflow-app name="import regular entities as Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep3</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL used to access the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the postgres user</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the postgres password</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep1}'/>
|
||||
<mkdir path='${migrationPathStep1}'/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ResetEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetEntities">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep2}'/>
|
||||
<mkdir path='${migrationPathStep2}'/>
|
||||
</fs>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="ResetGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetGraph">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep3}'/>
|
||||
<mkdir path='${migrationPathStep3}'/>
|
||||
</fs>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-g</arg><arg>${migrationPathStep3}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,103 @@
|
|||
<workflow-app name="import regular entities as Graph (step 1)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL used to access the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the postgres user</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the postgres password</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep1}'/>
|
||||
<mkdir path='${migrationPathStep1}'/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,74 @@
|
|||
<workflow-app name="import regular entities as Graph (step 2)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL used to access the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the postgres user</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the postgres password</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetEntities"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetEntities">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep2}'/>
|
||||
<mkdir path='${migrationPathStep2}'/>
|
||||
</fs>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,60 @@
|
|||
<workflow-app name="import regular entities as Graph (step 3)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>migrationPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep3</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetGraph"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetGraph">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep3}'/>
|
||||
<mkdir path='${migrationPathStep3}'/>
|
||||
</fs>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-g</arg><arg>${migrationPathStep3}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,79 +1,87 @@
|
|||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
||||
|
||||
public class CollectionJobTest {
|
||||
private Path testDir;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
}
|
||||
private Path testDir;
|
||||
|
||||
@After
|
||||
public void teadDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tesCollection() throws Exception {
|
||||
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
|
||||
GenerateNativeStoreSparkJob.main(new String[] {
|
||||
"-mt", "local",
|
||||
"-w", "wid",
|
||||
"-e", "XML",
|
||||
"-d", ""+System.currentTimeMillis(),
|
||||
"-p", new ObjectMapper().writeValueAsString(provenance),
|
||||
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
|
||||
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
|
||||
"-o", testDir.toString()+"/store",
|
||||
"-t", "true",
|
||||
"-ru", "",
|
||||
"-rp", "",
|
||||
"-rh", "",
|
||||
"-ro", "",
|
||||
"-rr", ""});
|
||||
System.out.println(new ObjectMapper().writeValueAsString(provenance));
|
||||
}
|
||||
@After
|
||||
public void teadDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tesCollection() throws Exception {
|
||||
final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
|
||||
GenerateNativeStoreSparkJob.main(new String[] {
|
||||
"-mt", "local",
|
||||
"-w", "wid",
|
||||
"-e", "XML",
|
||||
"-d", "" + System.currentTimeMillis(),
|
||||
"-p", new ObjectMapper().writeValueAsString(provenance),
|
||||
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
|
||||
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
|
||||
"-o", testDir.toString() + "/store",
|
||||
"-t", "true",
|
||||
"-ru", "",
|
||||
"-rp", "",
|
||||
"-rh", "",
|
||||
"-ro", "",
|
||||
"-rr", "" });
|
||||
System.out.println(new ObjectMapper().writeValueAsString(provenance));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenerationMetadataRecord() throws Exception {
|
||||
|
||||
@Test
|
||||
public void testGenerationMetadataRecord() throws Exception {
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
final MetadataRecord record = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
|
||||
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
assert record != null;
|
||||
System.out.println(record.getId());
|
||||
System.out.println(record.getOriginalId());
|
||||
|
||||
assert record != null;
|
||||
System.out.println(record.getId());
|
||||
System.out.println(record.getOriginalId());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestEquals() throws IOException {
|
||||
|
||||
}
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
final MetadataRecord record = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
final MetadataRecord record1 = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
assert record != null;
|
||||
record.setBody("ciao");
|
||||
assert record1 != null;
|
||||
record1.setBody("mondo");
|
||||
Assert.assertEquals(record, record1);
|
||||
|
||||
|
||||
@Test
|
||||
public void TestEquals () throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
assert record != null;
|
||||
record.setBody("ciao");
|
||||
assert record1 != null;
|
||||
record1.setBody("mondo");
|
||||
Assert.assertEquals(record, record1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,13 +18,13 @@ public class GraphMappingUtils {
|
|||
public final static Map<String, Class> types = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
types.put("datasource", Datasource.class);
|
||||
types.put("organization", Organization.class);
|
||||
types.put("datasource", Datasource.class);
|
||||
types.put("organization", Organization.class);
|
||||
types.put("project", Project.class);
|
||||
types.put("dataset", Dataset.class);
|
||||
types.put("otherresearchproduct", OtherResearchProduct.class);
|
||||
types.put("software", Software.class);
|
||||
types.put("publication", Publication.class);
|
||||
types.put("dataset", Dataset.class);
|
||||
types.put("otherresearchproduct", OtherResearchProduct.class);
|
||||
types.put("software", Software.class);
|
||||
types.put("publication", Publication.class);
|
||||
types.put("relation", Relation.class);
|
||||
}
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.graph;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@ -13,31 +13,40 @@ public class SparkGraphImporterJob {
|
|||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
|
||||
try(SparkSession spark = getSparkSession(parser)) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String hiveDbName = parser.get("hive_db_name");
|
||||
|
||||
spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
|
||||
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
||||
|
||||
// Read the input file and convert it into RDD of serializable object
|
||||
GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
|
||||
.map(s -> new ObjectMapper().readValue(s, clazz))
|
||||
.rdd(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(hiveDbName + "." + name));
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config("hive.metastore.uris", parser.get("hive_metastore_uris"))
|
||||
.config(conf)
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String hiveDbName = parser.get("hive_db_name");
|
||||
|
||||
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
||||
|
||||
// Read the input file and convert it into RDD of serializable object
|
||||
GraphMappingUtils.types.forEach((name, clazz) -> {
|
||||
spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
|
||||
.map(s -> new ObjectMapper().readValue(s._2().toString(), clazz))
|
||||
.rdd(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(hiveDbName + "." + name);
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
-- Unified "result" view: exposes the columns shared by the four result tables
-- (publication, dataset, software, otherresearchproduct) as a single relation.
-- The view is qualified with ${hive_db_name} so that it is created next to the
-- tables it reads from, instead of in whatever database the session happens to
-- be using; it is dropped first so the script can be re-run safely.
DROP VIEW IF EXISTS ${hive_db_name}.result;

CREATE VIEW ${hive_db_name}.result as
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.publication p
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.dataset d
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.software s
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.otherresearchproduct o;
|
|
@ -37,12 +37,30 @@
|
|||
<name>MapGraphIntoDataFrame</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--hive_db_name</arg><arg>${hive_db_name}</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
</spark>
|
||||
<ok to="PostProcessing"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="PostProcessing">
|
||||
<hive xmlns="uri:oozie:hive-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<script>/eu/dnetlib/dhp/graph/hive/postprocessing.sql</script>
|
||||
<param>hive_db_name=${hive_db_name}</param>
|
||||
</hive>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
sparkDriverMemory=8G
|
||||
sparkExecutorMemory=8G
|
||||
sparkDriverMemory=10G
|
||||
sparkExecutorMemory=15G
|
||||
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
|
||||
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
|
||||
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
|
||||
outputPath=/tmp/openaire_provision
|
||||
format=TMF
|
||||
batchSize=2000
|
||||
sparkExecutorCoresForJoining=128
|
||||
sparkExecutorCoresForIndexing=64
|
||||
reuseRecords=true
|
||||
reuseRecords=false
|
||||
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource
|
|
@ -1,31 +1,32 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.graph.model.*;
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
|
||||
import eu.dnetlib.dhp.graph.utils.RelationPartitioner;
|
||||
import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Map;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
|
||||
|
||||
|
@ -45,10 +46,12 @@ import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
|
|||
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
|
||||
* and E_target = T. Objects in T are heavily pruned by all the unnecessary information
|
||||
*
|
||||
* 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ <T, R> ]
|
||||
* 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ <T, R> ]
|
||||
*/
|
||||
public class GraphJoiner implements Serializable {
|
||||
|
||||
private Map<String, LongAccumulator> accumulators = Maps.newHashMap();
|
||||
|
||||
public static final int MAX_RELS = 100;
|
||||
|
||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||
|
@ -61,24 +64,30 @@ public class GraphJoiner implements Serializable {
|
|||
|
||||
private String outPath;
|
||||
|
||||
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) {
|
||||
private String otherDsTypeId;
|
||||
|
||||
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) {
|
||||
this.spark = spark;
|
||||
this.contextMapper = contextMapper;
|
||||
this.otherDsTypeId = otherDsTypeId;
|
||||
this.inputPath = inputPath;
|
||||
this.outPath = outPath;
|
||||
|
||||
final SparkContext sc = spark.sparkContext();
|
||||
prepareAccumulators(sc);
|
||||
}
|
||||
|
||||
public GraphJoiner adjacencyLists() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext());
|
||||
|
||||
// read each entity
|
||||
JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource");
|
||||
JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization");
|
||||
JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project");
|
||||
JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset");
|
||||
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct");
|
||||
JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software");
|
||||
JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication");
|
||||
JavaPairRDD<String, TypedRow> datasource = readPathEntity(jsc, getInputPath(), "datasource");
|
||||
JavaPairRDD<String, TypedRow> organization = readPathEntity(jsc, getInputPath(), "organization");
|
||||
JavaPairRDD<String, TypedRow> project = readPathEntity(jsc, getInputPath(), "project");
|
||||
JavaPairRDD<String, TypedRow> dataset = readPathEntity(jsc, getInputPath(), "dataset");
|
||||
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct");
|
||||
JavaPairRDD<String, TypedRow> software = readPathEntity(jsc, getInputPath(), "software");
|
||||
JavaPairRDD<String, TypedRow> publication = readPathEntity(jsc, getInputPath(), "publication");
|
||||
|
||||
// create the union between all the entities
|
||||
final String entitiesPath = getOutPath() + "/entities";
|
||||
|
@ -93,31 +102,43 @@ public class GraphJoiner implements Serializable {
|
|||
.map(GraphMappingUtils::serialize)
|
||||
.saveAsTextFile(entitiesPath, GzipCodec.class);
|
||||
|
||||
JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
|
||||
JavaPairRDD<String, EntityRelEntity> entities = jsc.textFile(entitiesPath)
|
||||
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
|
||||
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
|
||||
|
||||
final String relationPath = getOutPath() + "/relation";
|
||||
// reads the relationships
|
||||
final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath())
|
||||
.filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted
|
||||
final JavaPairRDD<SortableRelationKey, EntityRelEntity> rels = readPathRelation(jsc, getInputPath())
|
||||
.filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted
|
||||
.map(p -> new EntityRelEntity().setRelation(p))
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p))
|
||||
.groupByKey()
|
||||
.mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p));
|
||||
rels
|
||||
.groupByKey(new RelationPartitioner(rels.getNumPartitions()))
|
||||
.map(p -> Iterables.limit(p._2(), MAX_RELS))
|
||||
.flatMap(p -> p.iterator())
|
||||
.map(s -> new ObjectMapper().writeValueAsString(s))
|
||||
.saveAsTextFile(relationPath, GzipCodec.class);
|
||||
|
||||
final JavaPairRDD<String, EntityRelEntity> relation = jsc.textFile(relationPath)
|
||||
.map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class))
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
|
||||
|
||||
//final String bySource = getOutPath() + "/1_join_by_target";
|
||||
JavaPairRDD<String, EntityRelEntity> bySource = relation
|
||||
final String bySourcePath = getOutPath() + "/join_by_source";
|
||||
relation
|
||||
.join(entities
|
||||
.filter(e -> !e._2().getSource().getDeleted())
|
||||
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
|
||||
.map(s -> new EntityRelEntity()
|
||||
.setRelation(s._2()._1().getRelation())
|
||||
.setTarget(s._2()._2().getSource()))
|
||||
.map(j -> new ObjectMapper().writeValueAsString(j))
|
||||
.saveAsTextFile(bySourcePath, GzipCodec.class);
|
||||
|
||||
JavaPairRDD<String, EntityRelEntity> bySource = jsc.textFile(bySourcePath)
|
||||
.map(e -> getObjectMapper().readValue(e, EntityRelEntity.class))
|
||||
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
|
||||
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>());
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
|
||||
entities
|
||||
.union(bySource)
|
||||
.groupByKey() // by source id
|
||||
|
@ -130,20 +151,6 @@ public class GraphJoiner implements Serializable {
|
|||
return this;
|
||||
}
|
||||
|
||||
public GraphJoiner asXML() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
|
||||
sc.textFile(joinedEntitiesPath)
|
||||
.map(s -> mapper.readValue(s, JoinedEntity.class))
|
||||
.mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
|
||||
.saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public SparkSession getSpark() {
|
||||
return spark;
|
||||
}
|
||||
|
@ -158,24 +165,23 @@ public class GraphJoiner implements Serializable {
|
|||
|
||||
// HELPERS
|
||||
|
||||
private OafEntity parseOaf(final String json, final String type) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) {
|
||||
try {
|
||||
switch (GraphMappingUtils.EntityType.valueOf(type)) {
|
||||
case publication:
|
||||
return o.readValue(json, Publication.class);
|
||||
return mapper.readValue(json, Publication.class);
|
||||
case dataset:
|
||||
return o.readValue(json, Dataset.class);
|
||||
return mapper.readValue(json, Dataset.class);
|
||||
case otherresearchproduct:
|
||||
return o.readValue(json, OtherResearchProduct.class);
|
||||
return mapper.readValue(json, OtherResearchProduct.class);
|
||||
case software:
|
||||
return o.readValue(json, Software.class);
|
||||
return mapper.readValue(json, Software.class);
|
||||
case datasource:
|
||||
return o.readValue(json, Datasource.class);
|
||||
return mapper.readValue(json, Datasource.class);
|
||||
case organization:
|
||||
return o.readValue(json, Organization.class);
|
||||
return mapper.readValue(json, Organization.class);
|
||||
case project:
|
||||
return o.readValue(json, Project.class);
|
||||
return mapper.readValue(json, Project.class);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid type: " + type);
|
||||
}
|
||||
|
@ -185,26 +191,26 @@ public class GraphJoiner implements Serializable {
|
|||
}
|
||||
|
||||
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
final ObjectMapper mapper = getObjectMapper();
|
||||
final JoinedEntity j = new JoinedEntity();
|
||||
final Links links2 = new Links();
|
||||
final Links links = new Links();
|
||||
for(EntityRelEntity rel : p._2()) {
|
||||
if (rel.hasMainEntity() & j.getEntity() == null) {
|
||||
j.setType(rel.getSource().getType());
|
||||
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType()));
|
||||
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper));
|
||||
}
|
||||
if (rel.hasRelatedEntity()) {
|
||||
try {
|
||||
links2.add(
|
||||
links.add(
|
||||
new eu.dnetlib.dhp.graph.model.Tuple2()
|
||||
.setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class))
|
||||
.setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
|
||||
.setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class))
|
||||
.setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
j.setLinks(links2);
|
||||
j.setLinks(links);
|
||||
if (j.getEntity() == null) {
|
||||
throw new IllegalStateException("missing main entity on '" + p._1() + "'");
|
||||
}
|
||||
|
@ -250,8 +256,38 @@ public class GraphJoiner implements Serializable {
|
|||
.setTargetId(json.read("$.target"))
|
||||
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
|
||||
.setType("relation")
|
||||
// Read the actual values from the parsed JSON document; passing the bare
// JSONPath string would store the expression itself (e.g. "$.relType")
// in every row instead of the relation's semantics.
.setRelType(json.read("$.relType"))
.setSubRelType(json.read("$.subRelType"))
.setRelClass(json.read("$.relClass"))
|
||||
.setOaf(s);
|
||||
});
|
||||
}
|
||||
|
||||
private ObjectMapper getObjectMapper() {
|
||||
return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
}
|
||||
|
||||
private void prepareAccumulators(SparkContext sc) {
|
||||
accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments"));
|
||||
accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
|
||||
accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
|
||||
accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
|
||||
accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
|
||||
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
|
||||
|
||||
accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
|
||||
accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
|
||||
accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
|
||||
accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
|
||||
accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf"));
|
||||
|
||||
accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution"));
|
||||
accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant"));
|
||||
accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant"));
|
||||
accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
|
||||
accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
|
||||
accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy"));
|
||||
accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ public class SparkXmlRecordBuilderJob {
|
|||
final String inputPath = parser.get("sourcePath");
|
||||
final String outputPath = parser.get("outputPath");
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String otherDsTypeId = parser.get("otherDsTypeId");
|
||||
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
if (fs.exists(new Path(outputPath))) {
|
||||
|
@ -31,8 +32,9 @@ public class SparkXmlRecordBuilderJob {
|
|||
fs.mkdirs(new Path(outputPath));
|
||||
}
|
||||
|
||||
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath)
|
||||
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
|
||||
.adjacencyLists();
|
||||
//.asXML();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Allows to sort relationships according to the priority defined in weights map.
|
||||
*/
|
||||
public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
|
||||
|
||||
private String sourceId;
|
||||
private String targetId;
|
||||
|
||||
private String relType;
|
||||
private String subRelType;
|
||||
private String relClass;
|
||||
|
||||
private final static Map<String, Integer> weights = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
weights.put("outcome", 0);
|
||||
weights.put("supplement", 1);
|
||||
weights.put("publicationDataset", 2);
|
||||
weights.put("relationship", 3);
|
||||
weights.put("similarity", 4);
|
||||
weights.put("affiliation", 5);
|
||||
|
||||
weights.put("provision", 6);
|
||||
weights.put("participation", 7);
|
||||
weights.put("dedup", 8);
|
||||
}
|
||||
|
||||
public static SortableRelationKey from(final EntityRelEntity e) {
|
||||
return new SortableRelationKey()
|
||||
.setSourceId(e.getRelation().getSourceId())
|
||||
.setTargetId(e.getRelation().getTargetId())
|
||||
.setRelType(e.getRelation().getRelType())
|
||||
.setSubRelType(e.getRelation().getSubRelType())
|
||||
.setRelClass(e.getRelation().getRelClass());
|
||||
}
|
||||
|
||||
public String getSourceId() {
|
||||
return sourceId;
|
||||
}
|
||||
|
||||
public SortableRelationKey setSourceId(String sourceId) {
|
||||
this.sourceId = sourceId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getTargetId() {
|
||||
return targetId;
|
||||
}
|
||||
|
||||
public SortableRelationKey setTargetId(String targetId) {
|
||||
this.targetId = targetId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public SortableRelationKey setRelType(String relType) {
|
||||
this.relType = relType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
|
||||
public SortableRelationKey setSubRelType(String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
|
||||
public SortableRelationKey setRelClass(String relClass) {
|
||||
this.relClass = relClass;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(SortableRelationKey o) {
|
||||
return ComparisonChain.start()
|
||||
.compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
|
||||
.compare(getSourceId(), o.getSourceId())
|
||||
.compare(getTargetId(), o.getTargetId())
|
||||
.result();
|
||||
}
|
||||
|
||||
}
|
|
@ -12,6 +12,10 @@ public class TypedRow implements Serializable {
|
|||
|
||||
private String type;
|
||||
|
||||
private String relType;
|
||||
private String subRelType;
|
||||
private String relClass;
|
||||
|
||||
private String oaf;
|
||||
|
||||
public String getSourceId() {
|
||||
|
@ -50,6 +54,33 @@ public class TypedRow implements Serializable {
|
|||
return this;
|
||||
}
|
||||
|
||||
/** Returns the relType value currently set on this row. */
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
/** Stores the given relType and returns this row, so that setters can be chained. */
public TypedRow setRelType(String relType) {
|
||||
this.relType = relType;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns the subRelType value currently set on this row. */
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
|
||||
/** Stores the given subRelType and returns this row, so that setters can be chained. */
public TypedRow setSubRelType(String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns the relClass value currently set on this row. */
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
|
||||
/** Stores the given relClass and returns this row, so that setters can be chained. */
public TypedRow setRelClass(String relClass) {
|
||||
this.relClass = relClass;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getOaf() {
|
||||
return oaf;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,8 @@ import static org.apache.commons.lang3.StringUtils.*;
|
|||
|
||||
public class GraphMappingUtils {
|
||||
|
||||
public static final String SEPARATOR = "_";
|
||||
|
||||
public enum EntityType {
|
||||
publication, dataset, otherresearchproduct, software, datasource, organization, project
|
||||
}
|
||||
|
@ -38,34 +40,6 @@ public class GraphMappingUtils {
|
|||
|
||||
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
|
||||
|
||||
private static BiMap<String, String> relClassMapping = HashBiMap.create();
|
||||
|
||||
// Each entry pairs a relationship class (key) with its inverse (value);
// "isRelatedTo" is its own inverse and is registered only once.
static {
    relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution");
    relClassMapping.put("isMergedIn", "merges");
    relClassMapping.put("isProducedBy", "produces");
    relClassMapping.put("hasParticipant", "isParticipant");
    relClassMapping.put("isProvidedBy", "provides");
    relClassMapping.put("isRelatedTo", "isRelatedTo");
    relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
    relClassMapping.put("isSupplementTo", "isSupplementedBy");
}
|
||||
|
||||
public static String getInverseRelClass(final String relClass) {
|
||||
String res = relClassMapping.get(relClass);
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
res = relClassMapping.inverse().get(relClass);
|
||||
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass);
|
||||
}
|
||||
|
||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||
|
||||
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
|
@ -158,7 +132,7 @@ public class GraphMappingUtils {
|
|||
re.setLegalname(j.read("$.legalname.value"));
|
||||
re.setLegalshortname(j.read("$.legalshortname.value"));
|
||||
re.setCountry(asQualifier(j.read("$.country")));
|
||||
|
||||
re.setWebsiteurl(j.read("$.websiteurl.value"));
|
||||
break;
|
||||
case project:
|
||||
re.setProjectTitle(j.read("$.title.value"));
|
||||
|
@ -250,5 +224,8 @@ public class GraphMappingUtils {
|
|||
return s;
|
||||
}
|
||||
|
||||
public static String getRelDescriptor(String relType, String subRelType, String relClass) {
|
||||
return relType + SEPARATOR + subRelType + SEPARATOR + relClass;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.graph.model.SortableRelationKey;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.util.Utils;
|
||||
|
||||
/**
|
||||
* Used in combination with SortableRelationKey, allows to partition the records by source id, therefore
|
||||
* allowing to sort relations sharing the same source id by the ordering defined in SortableRelationKey.
|
||||
*/
|
||||
public class RelationPartitioner extends Partitioner {
|
||||
|
||||
private int numPartitions;
|
||||
|
||||
public RelationPartitioner(int numPartitions) {
|
||||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numPartitions() {
|
||||
return numPartitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPartition(Object key) {
|
||||
return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions());
|
||||
}
|
||||
|
||||
}
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.graph.utils;
|
|||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.mycila.xmltool.XMLDoc;
|
||||
import com.mycila.xmltool.XMLTag;
|
||||
|
@ -11,6 +12,8 @@ import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
|||
import eu.dnetlib.dhp.graph.model.Tuple2;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
|
@ -27,6 +30,7 @@ import java.io.Serializable;
|
|||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -37,37 +41,49 @@ import static org.apache.commons.lang3.StringUtils.substringBefore;
|
|||
|
||||
public class XmlRecordFactory implements Serializable {
|
||||
|
||||
private Map<String, LongAccumulator> accumulators;
|
||||
|
||||
private Set<String> specialDatasourceTypes;
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
private String schemaLocation;
|
||||
|
||||
private Set<String> contextes = Sets.newHashSet();
|
||||
|
||||
private boolean indent = false;
|
||||
|
||||
public XmlRecordFactory(
|
||||
final ContextMapper contextMapper, final boolean indent,
|
||||
final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) {
|
||||
final String schemaLocation, final String otherDatasourceTypesUForUI) {
|
||||
|
||||
this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI);
|
||||
}
|
||||
|
||||
public XmlRecordFactory(
|
||||
final Map<String, LongAccumulator> accumulators,
|
||||
final ContextMapper contextMapper, final boolean indent,
|
||||
final String schemaLocation, final String otherDatasourceTypesUForUI) {
|
||||
|
||||
this.accumulators = accumulators;
|
||||
this.contextMapper = contextMapper;
|
||||
this.schemaLocation = schemaLocation;
|
||||
this.specialDatasourceTypes = otherDatasourceTypesUForUI;
|
||||
this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI));
|
||||
|
||||
this.indent = indent;
|
||||
}
|
||||
|
||||
public String build(final JoinedEntity je) {
|
||||
|
||||
final Set<String> contexts = Sets.newHashSet();
|
||||
|
||||
final OafEntity entity = je.getEntity();
|
||||
TemplateFactory templateFactory = new TemplateFactory();
|
||||
try {
|
||||
final List<String> metadata = metadata(je.getType(), entity);
|
||||
final List<String> metadata = metadata(je.getType(), entity, contexts);
|
||||
|
||||
// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
|
||||
final List<String> relations = listRelations(je, templateFactory);
|
||||
final List<String> relations = listRelations(je, templateFactory, contexts);
|
||||
|
||||
metadata.addAll(buildContexts(getMainType(je.getType())));
|
||||
metadata.addAll(buildContexts(getMainType(je.getType()), contexts));
|
||||
metadata.add(parseDataInfo(entity.getDataInfo()));
|
||||
|
||||
final String body = templateFactory.buildBody(
|
||||
|
@ -97,10 +113,11 @@ public class XmlRecordFactory implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
private List<String> metadata(final String type, final OafEntity entity) {
|
||||
private List<String> metadata(final String type, final OafEntity entity, final Set<String> contexts) {
|
||||
|
||||
final List<String> metadata = Lists.newArrayList();
|
||||
|
||||
|
||||
if (entity.getCollectedfrom() != null) {
|
||||
metadata.addAll(entity.getCollectedfrom()
|
||||
.stream()
|
||||
|
@ -123,6 +140,17 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (GraphMappingUtils.isResult(type)) {
|
||||
final Result r = (Result) entity;
|
||||
|
||||
if (r.getContext() != null) {
|
||||
contexts.addAll(r.getContext()
|
||||
.stream()
|
||||
.map(c -> c.getId())
|
||||
.collect(Collectors.toList()));
|
||||
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
|
||||
if (contexts.contains("dh-ch::subcommunity::2")) {
|
||||
contexts.add("clarin");
|
||||
}
|
||||
}
|
||||
|
||||
if (r.getTitle() != null) {
|
||||
metadata.addAll(r.getTitle()
|
||||
.stream()
|
||||
|
@ -235,16 +263,6 @@ public class XmlRecordFactory implements Serializable {
|
|||
}
|
||||
|
||||
metadata.add(mapQualifier("bestaccessright", getBestAccessright(r)));
|
||||
|
||||
if (r.getContext() != null) {
|
||||
contextes.addAll(r.getContext()
|
||||
.stream()
|
||||
.map(c -> c.getId())
|
||||
.collect(Collectors.toList()));
|
||||
if (contextes.contains("dh-ch::subcommunity::2")) {
|
||||
contextes.add("clarin");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (EntityType.valueOf(type)) {
|
||||
|
@ -445,7 +463,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (ds.getSubjects() != null) {
|
||||
metadata.addAll(ds.getSubjects()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty("subject", sp))
|
||||
.map(sp -> mapStructuredProperty("subjects", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
@ -580,7 +598,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (p.getFundingtree() != null) {
|
||||
metadata.addAll(p.getFundingtree()
|
||||
.stream()
|
||||
.map(ft -> asXmlElement("fundingtree", ft.getValue()))
|
||||
.map(ft -> ft.getValue())
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
@ -618,7 +636,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
return bestAccessRight;
|
||||
}
|
||||
|
||||
private List<String> listRelations(final JoinedEntity je, TemplateFactory templateFactory) {
|
||||
private List<String> listRelations(final JoinedEntity je, TemplateFactory templateFactory, final Set<String> contexts) {
|
||||
final List<String> rels = Lists.newArrayList();
|
||||
|
||||
for (final Tuple2 link : je.getLinks()) {
|
||||
|
@ -699,7 +717,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (re.getFundingtree() != null) {
|
||||
metadata.addAll(re.getFundingtree()
|
||||
.stream()
|
||||
.peek(ft -> fillContextMap(ft))
|
||||
.peek(ft -> fillContextMap(ft, contexts))
|
||||
.map(ft -> getRelFundingTree(ft))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -709,13 +727,23 @@ public class XmlRecordFactory implements Serializable {
|
|||
|
||||
}
|
||||
final DataInfo info = rel.getDataInfo();
|
||||
final String scheme = getScheme(re.getType(), targetType);
|
||||
|
||||
if (StringUtils.isBlank(scheme)) {
|
||||
throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType));
|
||||
}
|
||||
|
||||
final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass());
|
||||
if (accumulators.containsKey(accumulatorName)) {
|
||||
accumulators.get(accumulatorName).add(1);
|
||||
}
|
||||
|
||||
rels.add(templateFactory.getRel(
|
||||
targetType,
|
||||
rel.getTarget(),
|
||||
Sets.newHashSet(metadata),
|
||||
getInverseRelClass(rel.getRelClass()),
|
||||
getScheme(targetType, re.getType()),
|
||||
rel.getRelClass(),
|
||||
scheme,
|
||||
info));
|
||||
}
|
||||
return rels;
|
||||
|
@ -807,14 +835,14 @@ public class XmlRecordFactory implements Serializable {
|
|||
.collect(Collectors.toList()) : Lists.newArrayList();
|
||||
}
|
||||
|
||||
private List<String> buildContexts(final String type) {
|
||||
private List<String> buildContexts(final String type, final Set<String> contexts) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
|
||||
|
||||
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
|
||||
|
||||
for (final String context : contextes) {
|
||||
for (final String context : contexts) {
|
||||
|
||||
String id = "";
|
||||
for (final String token : Splitter.on("::").split(context)) {
|
||||
|
@ -882,7 +910,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
private void fillContextMap(final String xmlTree) {
|
||||
private void fillContextMap(final String xmlTree, final Set<String> contexts) {
|
||||
|
||||
Document fundingPath;
|
||||
try {
|
||||
|
@ -896,7 +924,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (funder != null) {
|
||||
|
||||
final String funderShortName = funder.valueOf("./shortname");
|
||||
contextes.add(funderShortName);
|
||||
contexts.add(funderShortName);
|
||||
|
||||
contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
|
||||
final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
|
||||
|
@ -905,17 +933,17 @@ public class XmlRecordFactory implements Serializable {
|
|||
contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
|
||||
final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
|
||||
if (level1 == null) {
|
||||
contextes.add(level0Id);
|
||||
contexts.add(level0Id);
|
||||
} else {
|
||||
final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
|
||||
contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
|
||||
final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
|
||||
if (level2 == null) {
|
||||
contextes.add(level1Id);
|
||||
contexts.add(level1Id);
|
||||
} else {
|
||||
final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
|
||||
contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
|
||||
contextes.add(level2Id);
|
||||
contexts.add(level2Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -928,7 +956,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String getRelFundingTree(final String xmlTree) {
|
||||
protected static String getRelFundingTree(final String xmlTree) {
|
||||
String funding = "<funding>";
|
||||
try {
|
||||
final Document ftree = new SAXReader().read(new StringReader(xmlTree));
|
||||
|
@ -949,11 +977,11 @@ public class XmlRecordFactory implements Serializable {
|
|||
return funding;
|
||||
}
|
||||
|
||||
private String getFunderElement(final Document ftree) {
|
||||
final String funderId = ftree.valueOf("//fundingtree/funder/id/text()");
|
||||
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()");
|
||||
final String funderName = ftree.valueOf("//fundingtree/funder/name/text()");
|
||||
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()");
|
||||
private static String getFunderElement(final Document ftree) {
|
||||
final String funderId = ftree.valueOf("//fundingtree/funder/id");
|
||||
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname");
|
||||
final String funderName = ftree.valueOf("//fundingtree/funder/name");
|
||||
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction");
|
||||
|
||||
return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName)
|
||||
+ "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
|
||||
]
|
|
@ -50,9 +50,10 @@
|
|||
<class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores ${sparkExecutorCoresForJoining}
|
||||
--executor-memory ${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -60,6 +61,7 @@
|
|||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn</arg>
|
||||
<arg>-is</arg> <arg>${isLookupUrl}</arg>
|
||||
<arg>-t</arg> <arg>${otherDsTypeId}</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
|
@ -77,8 +79,9 @@
|
|||
<class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--executor-cores ${sparkExecutorCoresForIndexing}
|
||||
--executor-memory ${sparkExecutorMemoryForIndexing}
|
||||
--driver-memory=${sparkDriverMemoryForIndexing}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class GraphJoinerTest {
|
||||
|
||||
private ClassLoader cl = getClass().getClassLoader();
|
||||
private Path workingDir;
|
||||
private Path inputDir;
|
||||
private Path outputDir;
|
||||
|
||||
@Before
|
||||
public void before() throws IOException {
|
||||
workingDir = Files.createTempDirectory("promote_action_set");
|
||||
inputDir = workingDir.resolve("input");
|
||||
outputDir = workingDir.resolve("output");
|
||||
}
|
||||
|
||||
private static void copyFiles(Path source, Path target) throws IOException {
|
||||
Files.list(source).forEach(f -> {
|
||||
try {
|
||||
if (Files.isDirectory(f)) {
|
||||
Path subTarget = Files.createDirectories(target.resolve(f.getFileName()));
|
||||
copyFiles(f, subTarget);
|
||||
} else {
|
||||
Files.copy(f, target.resolve(f.getFileName()));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
19
pom.xml
19
pom.xml
|
@ -76,7 +76,7 @@
|
|||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<version>${junit.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
|
@ -110,6 +110,12 @@
|
|||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
|
@ -262,6 +268,16 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<version>6.0.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-data-protos</artifactId>
|
||||
<version>3.9.8-proto250</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
|
@ -481,6 +497,7 @@
|
|||
<dhp.jackson.version>2.9.6</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<scala.version>2.11.12</scala.version>
|
||||
<junit.version>4.12</junit.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
</properties>
|
||||
</project>
|
||||
|
|
Loading…
Reference in New Issue