Merge remote-tracking branch 'origin/master' into przemyslawjacewicz_actionmanager_impl_prototype

przemek 2020-03-19 15:12:56 +01:00
commit 638b78f96a
63 changed files with 3135 additions and 868 deletions

(binary image file changed; new version: 689 KiB; preview not shown)


@@ -31,6 +31,11 @@
<artifactId>jackson-databind</artifactId>
</dependency>
+<dependency>
+<groupId>com.google.guava</groupId>
+<artifactId>guava</artifactId>
+</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>


@@ -0,0 +1,38 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.io.Serializable;
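// the concrete payload type is recorded in the serialized 'clazz' field so the custom deserializer below can restore it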
@JsonDeserialize(using = AtomicActionDeserializer.class)
public class AtomicAction<T extends Oaf> implements Serializable {
private Class<T> clazz;
private T payload;
public AtomicAction() {
}
public AtomicAction(Class<T> clazz, T payload) {
this.clazz = clazz;
this.payload = payload;
}
public Class<T> getClazz() {
return clazz;
}
public void setClazz(Class<T> clazz) {
this.clazz = clazz;
}
public T getPayload() {
return payload;
}
public void setPayload(T payload) {
this.payload = payload;
}
}


@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.io.IOException;
public class AtomicActionDeserializer extends JsonDeserializer {
@Override
public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException {
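// read the type tag from the 'clazz' field, resolve it with Class.forName, then bind the 'payload' node to that class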
JsonNode node = jp.getCodec().readTree(jp);
String classTag = node.get("clazz").asText();
JsonNode payload = node.get("payload");
ObjectMapper mapper = new ObjectMapper();
try {
final Class<?> clazz = Class.forName(classTag);
return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz));
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
}


@@ -6,7 +6,7 @@ import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.Assert;
+import static com.google.common.base.Preconditions.checkArgument;
public class Relation extends Oaf {
@@ -71,11 +71,13 @@ public class Relation extends Oaf {
}
public void mergeFrom(final Relation r) {
-Assert.assertEquals("source ids must be equal", getSource(), r.getSource());
-Assert.assertEquals("target ids must be equal", getTarget(), r.getTarget());
-Assert.assertEquals("relType(s) must be equal", getRelType(), r.getRelType());
-Assert.assertEquals("subRelType(s) must be equal", getSubRelType(), r.getSubRelType());
-Assert.assertEquals("relClass(es) must be equal", getRelClass(), r.getRelClass());
+checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal");
+checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal");
+checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal");
+checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be equal");
+checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal");
setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
@@ -85,18 +87,18 @@ public class Relation extends Oaf {
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
-if (!super.equals(o)) return false;
Relation relation = (Relation) o;
-return Objects.equals(relType, relation.relType) &&
-Objects.equals(subRelType, relation.subRelType) &&
-Objects.equals(relClass, relation.relClass) &&
-Objects.equals(source, relation.source) &&
-Objects.equals(target, relation.target) &&
+return relType.equals(relation.relType) &&
+subRelType.equals(relation.subRelType) &&
+relClass.equals(relation.relClass) &&
+source.equals(relation.source) &&
+target.equals(relation.target) &&
Objects.equals(collectedFrom, relation.collectedFrom);
}
@Override
public int hashCode() {
-return Objects.hash(super.hashCode(), relType, subRelType, relClass, source, target, collectedFrom);
+return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom);
}
}


@@ -5,7 +5,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.Objects;
-public abstract class Result extends OafEntity implements Serializable {
+public class Result extends OafEntity implements Serializable {
private List<Author> author;


@@ -0,0 +1,37 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.lang3.StringUtils;
import org.junit.Assert;
import org.junit.Test;
import java.io.IOException;
public class AtomicActionTest {
@Test
public void serializationTest() throws IOException {
Relation rel = new Relation();
rel.setSource("1");
rel.setTarget("2");
rel.setRelType("resultResult");
rel.setSubRelType("dedup");
rel.setRelClass("merges");
AtomicAction aa1 = new AtomicAction(Relation.class, rel);
final ObjectMapper mapper = new ObjectMapper();
String json = mapper.writeValueAsString(aa1);
Assert.assertTrue(StringUtils.isNotBlank(json));
AtomicAction aa2 = mapper.readValue(json, AtomicAction.class);
Assert.assertEquals(aa1.getClazz(), aa2.getClazz());
Assert.assertEquals(aa1.getPayload(), aa2.getPayload());
}
}


@@ -24,6 +24,12 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
+<exclusions>
+<exclusion>
+<groupId>com.sun.xml.bind</groupId>
+<artifactId>jaxb-core</artifactId>
+</exclusion>
+</exclusions>
</dependency>
<dependency>
@@ -32,6 +38,49 @@
<version>${project.version}</version>
</dependency>
+<dependency>
+<groupId>eu.dnetlib</groupId>
+<artifactId>dnet-actionmanager-common</artifactId>
+<exclusions>
+<exclusion>
+<groupId>eu.dnetlib</groupId>
+<artifactId>dnet-openaireplus-mapping-utils</artifactId>
+</exclusion>
+<exclusion>
+<groupId>saxonica</groupId>
+<artifactId>saxon</artifactId>
+</exclusion>
+<exclusion>
+<groupId>saxonica</groupId>
+<artifactId>saxon-dom</artifactId>
+</exclusion>
+<exclusion>
+<groupId>jgrapht</groupId>
+<artifactId>jgrapht</artifactId>
+</exclusion>
+<exclusion>
+<groupId>net.sf.ehcache</groupId>
+<artifactId>ehcache</artifactId>
+</exclusion>
+<exclusion>
+<groupId>org.springframework</groupId>
+<artifactId>spring-test</artifactId>
+</exclusion>
+<exclusion>
+<groupId>org.apache.*</groupId>
+<artifactId>*</artifactId>
+</exclusion>
+<exclusion>
+<groupId>apache</groupId>
+<artifactId>*</artifactId>
+</exclusion>
+</exclusions>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib</groupId>
+<artifactId>dnet-openaire-data-protos</artifactId>
+</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@@ -56,6 +105,11 @@
<artifactId>mongo-java-driver</artifactId>
</dependency>
+<dependency>
+<groupId>org.apache.hadoop</groupId>
+<artifactId>hadoop-distcp</artifactId>
+</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>


@@ -1,94 +0,0 @@
package eu.dnetlib.dhp.migration;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
public class ExtractEntitiesFromHDFSJob {
private static final Log log = LogFactory.getLog(ExtractEntitiesFromHDFSJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
try (final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
final List<String> sourcePaths = Arrays.stream(parser.get("sourcePaths").split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
final String targetPath = parser.get("graphRawPath");
processEntity(sc, Publication.class, sourcePaths, targetPath);
processEntity(sc, Dataset.class, sourcePaths, targetPath);
processEntity(sc, Software.class, sourcePaths, targetPath);
processEntity(sc, OtherResearchProduct.class, sourcePaths, targetPath);
processEntity(sc, Datasource.class, sourcePaths, targetPath);
processEntity(sc, Organization.class, sourcePaths, targetPath);
processEntity(sc, Project.class, sourcePaths, targetPath);
processEntity(sc, Relation.class, sourcePaths, targetPath);
}
}
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final List<String> sourcePaths, final String targetPath) {
final String type = clazz.getSimpleName().toLowerCase();
log.info(String.format("Processing entities (%s) in files:", type));
sourcePaths.forEach(log::info);
JavaRDD<String> inputRdd = sc.emptyRDD();
for (final String sp : sourcePaths) {
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.filter(k -> isEntityType(k._1(), type))
.map(Tuple2::_2));
}
inputRdd.saveAsTextFile(targetPath + "/" + type);
}
private static boolean isEntityType(final String item, final String type) {
return StringUtils.substringAfter(item, ":").equalsIgnoreCase(type);
}
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
try {
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
final Path path = new Path(pathToFile);
return hdfs.exists(path);
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}


@@ -1,45 +0,0 @@
package eu.dnetlib.dhp.migration;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MigrateMongoMdstoresApplication {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl");
final String mongoDb = parser.get("mongoDb");
final String mdFormat = parser.get("mdFormat");
final String mdLayout = parser.get("mdLayout");
final String mdInterpretation = parser.get("mdInterpretation");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
if (mdFormat.equalsIgnoreCase("oaf")) {
try (final OafMigrationExecutor mig =
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else if (mdFormat.equalsIgnoreCase("odf")) {
try (final OdfMigrationExecutor mig =
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else {
throw new RuntimeException("Format not supported: " + mdFormat);
}
}
}


@@ -0,0 +1,49 @@
package eu.dnetlib.dhp.migration.actions;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import java.util.Comparator;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
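// orders access-right class ids from most to least open (OPEN SOURCE, OPEN, 6MONTHS, 12MONTHS, EMBARGO, RESTRICTED, CLOSED, UNKNOWN); nulls sort last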
if (left == null && right == null) return 0;
if (left == null) return 1;
if (right == null) return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass)) return 0;
if (lClass.equals("OPEN SOURCE")) return -1;
if (rClass.equals("OPEN SOURCE")) return 1;
if (lClass.equals("OPEN")) return -1;
if (rClass.equals("OPEN")) return 1;
if (lClass.equals("6MONTHS")) return -1;
if (rClass.equals("6MONTHS")) return 1;
if (lClass.equals("12MONTHS")) return -1;
if (rClass.equals("12MONTHS")) return 1;
if (lClass.equals("EMBARGO")) return -1;
if (rClass.equals("EMBARGO")) return 1;
if (lClass.equals("RESTRICTED")) return -1;
if (rClass.equals("RESTRICTED")) return 1;
if (lClass.equals("CLOSED")) return -1;
if (rClass.equals("CLOSED")) return 1;
if (lClass.equals("UNKNOWN")) return -1;
if (rClass.equals("UNKNOWN")) return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}


@@ -0,0 +1,170 @@
package eu.dnetlib.dhp.migration.actions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.stream.Collectors;
public class MigrateActionSet {
private static final Log log = LogFactory.getLog(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
private static Boolean DEFAULT_TRANSFORM_ONLY = false;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
}
private void run(ArgumentApplicationParser parser) throws Exception {
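// resolves the action set rawset paths from the IS, copies each one to the target cluster via DistCp (skipped when transform_only is set), then reports the resulting target paths through the oozie action output properties file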
final String isLookupUrl = parser.get("isLookupUrl");
final String sourceNN = parser.get("sourceNameNode");
final String targetNN = parser.get("targetNameNode");
final String workDir = parser.get("workingDirectory");
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
final String distcp_memory_mb = parser.get("distcp_memory_mb");
final String distcp_task_timeout = parser.get("distcp_task_timeout");
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: " + transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: " + transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
FileSystem targetFS = FileSystem.get(conf);
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
FileSystem sourceFS = FileSystem.get(sourceConf);
Properties props = new Properties();
List<Path> targetPaths = new ArrayList<>();
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))));
for(Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn(String.format("skipping unexisting path: %s", source));
} else {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info(String.format("got RAWSET: %s", rawSet));
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
final String actionSetDirectory = pathQ.pollLast();
final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
log.info(String.format("using TARGET PATH: %s", targetPath));
if (!transformOnly) {
if (targetFS.exists(targetPath)) {
targetFS.delete(targetPath, true);
}
runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
}
targetPaths.add(targetPath);
}
}
}
props.setProperty(TARGET_PATHS, targetPaths
.stream()
.map(p -> p.toString())
.collect(Collectors.joining(",")));
File file = new File(System.getProperty("oozie.action.output.properties"));
try(OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
System.out.println(file.getAbsolutePath());
}
private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
final DistCpOptions op = new DistCpOptions(source, targetPath);
op.setMaxMaps(distcp_num_maps);
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
int res = ToolRunner.run(new DistCp(conf, op), new String[]{
"-Dmapred.task.timeout=" + distcp_task_timeout,
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
"-pb",
"-m " + distcp_num_maps,
source.toString(),
targetPath.toString()});
if (res != 0) {
throw new RuntimeException(String.format("distcp exited with code %s", res));
}
}
private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
final Configuration conf = new Configuration();
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
conf.set("dfs.http.client.retry.policy.enabled", "true");
conf.set("mapred.task.timeout", distcp_task_timeout);
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
return conf;
}
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
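// XQuery over the IS profiles: concatenates the ActionManager base path with each set's directory and its latest rawset id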
String XQUERY = "distinct-values(\n" +
"let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
"for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
"let $setDir := $x//SET/@directory/string()\n" +
"let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
"return concat($basePath, '/', $setDir, '/', $rawSet))";
log.info(String.format("running xquery:\n%s", XQUERY));
return isLookUp.quickSearchProfile(XQUERY)
.stream()
.map(p -> sourceNN + p)
.map(Path::new)
.collect(Collectors.toList());
}
}


@@ -0,0 +1,580 @@
package eu.dnetlib.dhp.migration.actions;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.*;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class ProtoConverter implements Serializable {
public static final String UNKNOWN = "UNKNOWN";
public static final String NOT_AVAILABLE = "not available";
public static final String DNET_ACCESS_MODES = "dnet:access_modes";
public static Oaf convert(OafProtos.Oaf oaf) {
try {
switch (oaf.getKind()) {
case entity:
return convertEntity(oaf);
case relation:
return convertRelation(oaf);
default:
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
}
} catch (Throwable e) {
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
}
}
private static Relation convertRelation(OafProtos.Oaf oaf) {
final OafProtos.OafRel r = oaf.getRel();
final Relation rel = new Relation();
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
rel.setSource(r.getSource());
rel.setTarget(r.getTarget());
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel.setCollectedFrom(r.getCollectedfromCount() > 0 ?
r.getCollectedfromList().stream()
.map(kv -> mapKV(kv))
.collect(Collectors.toList()) : null);
return rel;
}
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getType()) {
case result:
final Result r = convertResult(oaf);
r.setInstance(convertInstances(oaf));
return r;
case project:
return convertProject(oaf);
case datasource:
return convertDataSource(oaf);
case organization:
return convertOrganization(oaf);
default:
throw new RuntimeException("received unknown type");
}
}
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
final ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getInstanceCount() > 0) {
return r.getInstanceList()
.stream()
.map(i -> convertInstance(i))
.collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation());
i.setHostedby(mapKV(ri.getHostedby()));
i.setInstancetype(mapQualifier(ri.getInstancetype()));
i.setLicense(mapStringField(ri.getLicense()));
i.setUrl(ri.getUrlList());
i.setRefereed(mapStringField(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Organization convertOrganization(OafProtos.Oaf oaf) {
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
final Organization org = setOaf(new Organization(), oaf);
setEntity(org, oaf);
org.setLegalshortname(mapStringField(m.getLegalshortname()));
org.setLegalname(mapStringField(m.getLegalname()));
org.setAlternativeNames(m.getAlternativeNamesList().
stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
org.setLogourl(mapStringField(m.getLogourl()));
org.setEclegalbody(mapStringField(m.getEclegalbody()));
org.setEclegalperson(mapStringField(m.getEclegalperson()));
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
org.setEchighereducation(mapStringField(m.getEchighereducation()));
org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
org.setEcenterprise(mapStringField(m.getEcenterprise()));
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
org.setEcnutscode(mapStringField(m.getEcnutscode()));
org.setCountry(mapQualifier(m.getCountry()));
return org;
}
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
final Datasource datasource = setOaf(new Datasource(), oaf);
setEntity(datasource, oaf);
datasource.setAccessinfopackage(m.getAccessinfopackageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setCertificates(mapStringField(m.getCertificates()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setContactemail(mapStringField(m.getContactemail()));
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
datasource.setDescription(mapStringField(m.getDescription()));
datasource.setEnglishname(mapStringField(m.getEnglishname()));
datasource.setLatitude(mapStringField(m.getLatitude()));
datasource.setLongitude(mapStringField(m.getLongitude()));
datasource.setLogourl(mapStringField(m.getLogourl()));
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
datasource.setOdcontenttypes(m.getOdcontenttypesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdlanguages(m.getOdlanguagesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
datasource.setOfficialname(mapStringField(m.getOfficialname()));
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
datasource.setPidsystems(mapStringField(m.getPidsystems()));
datasource.setPolicies(m.getPoliciesList()
.stream()
.map(ProtoConverter::mapKV)
.collect(Collectors.toList()));
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
datasource.setSubjects(m.getSubjectsList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
datasource.setVersioning(mapBoolField(m.getVersioning()));
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
datasource.setJournal(mapJournal(m.getJournal()));
return datasource;
}
private static Project convertProject(OafProtos.Oaf oaf) {
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
final Project project = setOaf(new Project(), oaf);
setEntity(project, oaf);
project.setAcronym(mapStringField(m.getAcronym()));
project.setCallidentifier(mapStringField(m.getCallidentifier()));
project.setCode(mapStringField(m.getCode()));
project.setContactemail(mapStringField(m.getContactemail()));
project.setContactfax(mapStringField(m.getContactfax()));
project.setContactfullname(mapStringField(m.getContactfullname()));
project.setContactphone(mapStringField(m.getContactphone()));
project.setContracttype(mapQualifier(m.getContracttype()));
project.setCurrency(mapStringField(m.getCurrency()));
project.setDuration(mapStringField(m.getDuration()));
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
project.setEcsc39(mapStringField(m.getEcsc39()));
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
project.setStartdate(mapStringField(m.getStartdate()));
project.setEnddate(mapStringField(m.getEnddate()));
project.setFundedamount(m.getFundedamount());
project.setTotalcost(m.getTotalcost());
project.setKeywords(mapStringField(m.getKeywords()));
project.setSubjects(m.getSubjectsList().stream()
.map(sp -> mapStructuredProperty(sp))
.collect(Collectors.toList()));
project.setTitle(mapStringField(m.getTitle()));
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
project.setFundingtree(m.getFundingtreeList().stream()
.map(f -> mapStringField(f))
.collect(Collectors.toList()));
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
project.setSummary(mapStringField(m.getSummary()));
project.setOptional1(mapStringField(m.getOptional1()));
project.setOptional2(mapStringField(m.getOptional2()));
return project;
}
private static Result convertResult(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
case "dataset":
return createDataset(oaf);
case "publication":
return createPublication(oaf);
case "software":
return createSoftware(oaf);
case "other":
return createORP(oaf);
default:
Result result = setOaf(new Result(), oaf);
setEntity(result, oaf);
return setResult(result, oaf);
}
}
private static Software createSoftware(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Software software = setOaf(new Software(), oaf);
setEntity(software, oaf);
setResult(software, oaf);
software.setDocumentationUrl(m.getDocumentationUrlList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
software.setLicense(m.getLicenseList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
return software;
}
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
setEntity(otherResearchProducts, oaf);
setResult(otherResearchProducts, oaf);
otherResearchProducts.setContactperson(m.getContactpersonList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts.setContactgroup(m.getContactgroupList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts.setTool(m.getToolList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return otherResearchProducts;
}
private static Publication createPublication(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Publication publication = setOaf(new Publication(), oaf);
setEntity(publication, oaf);
setResult(publication, oaf);
publication.setJournal(mapJournal(m.getJournal()));
return publication;
}
private static Dataset createDataset(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Dataset dataset = setOaf(new Dataset(), oaf);
setEntity(dataset, oaf);
setResult(dataset, oaf);
dataset.setStoragedate(mapStringField(m.getStoragedate()));
dataset.setDevice(mapStringField(m.getDevice()));
dataset.setSize(mapStringField(m.getSize()));
dataset.setVersion(mapStringField(m.getVersion()));
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
dataset.setGeolocation(m.getGeolocationList()
.stream()
.map(ProtoConverter::mapGeolocation)
.collect(Collectors.toList()));
return dataset;
}
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
return oaf;
}
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
//setting Entity fields
final OafProtos.OafEntity e = oaf.getEntity();
entity.setId(e.getId());
entity.setOriginalId(e.getOriginalIdList());
entity.setCollectedfrom(e.getCollectedfromList()
.stream()
.map(ProtoConverter::mapKV)
.collect(Collectors.toList()));
entity.setPid(e.getPidList().stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDateofcollection(e.getDateofcollection());
entity.setDateoftransformation(e.getDateoftransformation());
entity.setExtraInfo(e.getExtraInfoList()
.stream()
.map(ProtoConverter::mapExtraInfo)
.collect(Collectors.toList()));
return entity;
}
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
//setting Entity fields
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
entity.setAuthor(m.getAuthorList()
.stream()
.map(ProtoConverter::mapAuthor)
.collect(Collectors.toList()));
entity.setResulttype(mapQualifier(m.getResulttype()));
entity.setLanguage(mapQualifier(m.getLanguage()));
entity.setCountry(m.getCountryList()
.stream()
.map(ProtoConverter::mapQualifierAsCountry)
.collect(Collectors.toList()));
entity.setSubject(m.getSubjectList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setTitle(m.getTitleList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setRelevantdate(m.getRelevantdateList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDescription(m.getDescriptionList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
entity.setPublisher(mapStringField(m.getPublisher()));
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
entity.setSource(m.getSourceList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setFulltext(m.getFulltextList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setFormat(m.getFormatList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setContributor(m.getContributorList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setResourcetype(mapQualifier(m.getResourcetype()));
entity.setCoverage(m.getCoverageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setContext(m.getContextList()
.stream()
.map(ProtoConverter::mapContext)
.collect(Collectors.toList()));
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
return entity;
}
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
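// picks the most open access right among the instances (ordering defined by LicenseComparator) and fills in defaults for blank classid/classname/scheme values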
if (instanceList != null) {
final Optional<FieldTypeProtos.Qualifier> min = instanceList.stream()
.map(i -> i.getAccessright()).min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private static Context mapContext(ResultProtos.Result.Context context) {
final Context entity = new Context();
entity.setId(context.getId());
entity.setDataInfo(context.getDataInfoList()
.stream()
.map(ProtoConverter::mapDataInfo)
.collect(Collectors.toList()));
return entity;
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
return keyValue;
}
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
dataInfo.setInferred(d.getInferred());
dataInfo.setInvisible(d.getInvisible());
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
dataInfo.setTrust(d.getTrust());
return dataInfo;
}
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(q.getClassid());
qualifier.setClassname(q.getClassname());
qualifier.setSchemeid(q.getSchemeid());
qualifier.setSchemename(q.getSchemename());
return qualifier;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country();
c.setClassid(q.getClassid());
c.setClassname(q.getClassname());
c.setSchemeid(q.getSchemeid());
c.setSchemename(q.getSchemename());
c.setDataInfo(mapDataInfo(q.getDataInfo()));
return c;
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
return structuredProperty;
}
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
final ExtraInfo entity = new ExtraInfo();
entity.setName(extraInfo.getName());
entity.setTypology(extraInfo.getTypology());
entity.setProvenance(extraInfo.getProvenance());
entity.setTrust(extraInfo.getTrust());
entity.setValue(extraInfo.getValue());
return entity;
}
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
final OAIProvenance entity = new OAIProvenance();
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
return entity;
}
public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
final OriginDescription originDescriptionResult = new OriginDescription();
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
originDescriptionResult.setAltered(originDescription.getAltered());
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
return originDescriptionResult;
}
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
final Field<String> stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
return stringField;
}
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
final Field<Boolean> booleanField = new Field<>();
booleanField.setValue(b.getValue());
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
return booleanField;
}
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
final Field<Integer> entity = new Field<>();
entity.setValue(b.getValue());
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
return entity;
}
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
journal.setConferenceplace(j.getConferenceplace());
journal.setEdition(j.getEdition());
journal.setEp(j.getEp());
journal.setIss(j.getIss());
journal.setIssnLinking(j.getIssnLinking());
journal.setIssnOnline(j.getIssnOnline());
journal.setIssnPrinted(j.getIssnPrinted());
journal.setName(j.getName());
journal.setSp(j.getSp());
journal.setVol(j.getVol());
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
return journal;
}
public static Author mapAuthor(FieldTypeProtos.Author author) {
final Author entity = new Author();
entity.setFullname(author.getFullname());
entity.setName(author.getName());
entity.setSurname(author.getSurname());
entity.setRank(author.getRank());
entity.setPid(author.getPidList()
.stream()
.map(kv -> {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(kv.getValue());
final Qualifier q = new Qualifier();
q.setClassid(kv.getKey());
q.setClassname(kv.getKey());
sp.setQualifier(q);
return sp;
})
.collect(Collectors.toList()));
entity.setAffiliation(author.getAffiliationList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return entity;
}
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
final GeoLocation entity = new GeoLocation();
entity.setPoint(geoLocation.getPoint());
entity.setBox(geoLocation.getBox());
entity.setPlace(geoLocation.getPlace());
return entity;
}
}


@@ -0,0 +1,194 @@
package eu.dnetlib.dhp.migration.actions;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
public class TransformActions implements Serializable {
private static final Log log = LogFactory.getLog(TransformActions.class);
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
new TransformActions().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: " + isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: " + inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
try(SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info(String.format("found target directory '%s", targetDirectory));
fs.delete(targetDirectory, true);
log.info(String.format("deleted target directory '%s", targetDirectory));
}
log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
sc.sequenceFile(sourcePath, Text.class, Text.class)
.mapToPair(a -> new Tuple2<>(a._1(), eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())))
.mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
.filter(t -> StringUtils.isNotBlank(t._2().toString()))
.saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
}
}
private Text transformAction(String atomicaActionId, eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
final Text out = new Text();
final ObjectMapper mapper = new ObjectMapper();
if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
out.set(mapper.writeValueAsString(doTransform(aa)));
} else {
if (atomicaActionId.contains("dedupSimilarity")) {
out.set(mapper.writeValueAsString(getRelationAtomicAction(atomicaActionId)));
}
}
return out;
}
private AtomicAction<Relation> getRelationAtomicAction(String atomicaActionId) {
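// dedupSimilarity action ids are expected in the form <source>@<relType>_<subRelType>_<relClass>@<target>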
final String[] splitId = atomicaActionId.split("@");
String source = splitId[0];
String target = splitId[2];
String[] relSemantic = splitId[1].split("_");
Relation rel = new Relation();
rel.setSource(source);
rel.setTarget(target);
rel.setRelType(relSemantic[0]);
rel.setSubRelType(relSemantic[1]);
rel.setRelClass(relSemantic[2]);
DataInfo d = new DataInfo();
d.setDeletedbyinference(false);
d.setInferenceprovenance("deduplication");
d.setInferred(true);
d.setInvisible(false);
Qualifier provenanceaction = new Qualifier();
provenanceaction.setClassid("deduplication");
provenanceaction.setClassname("deduplication");
provenanceaction.setSchemeid("dnet:provenanceActions");
provenanceaction.setSchemename("dnet:provenanceActions");
d.setProvenanceaction(provenanceaction);
rel.setDataInfo(d);
return new AtomicAction<>(Relation.class, rel);
}
private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException {
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(TransformActions.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}


@ -1,4 +1,14 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.step1;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
@ -17,6 +27,8 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
import eu.dnetlib.dhp.migration.utils.DbClient;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
@ -34,7 +46,7 @@ import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable { public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
@ -56,12 +68,10 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
final String dbPassword = parser.get("postgresPassword"); final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath"); final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) { try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) {
if (processClaims) { if (processClaims) {
log.info("Processing claims..."); log.info("Processing claims...");
smdbe.execute("queryClaims.sql", smdbe::processClaims); smdbe.execute("queryClaims.sql", smdbe::processClaims);
@ -85,9 +95,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
} }
} }
public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser, public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception { final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser); super(hdfsPath);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime(); this.lastUpdateTimestamp = new Date().getTime();
} }
@ -105,7 +115,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
final Datasource ds = new Datasource(); final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"))); ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>()); ds.setPid(new ArrayList<>());
@ -212,7 +222,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
final Project p = new Project(); final Project p = new Project();
p.setId(createOpenaireId(40, rs.getString("projectid"))); p.setId(createOpenaireId(40, rs.getString("projectid"), true));
p.setOriginalId(Arrays.asList(rs.getString("projectid"))); p.setOriginalId(Arrays.asList(rs.getString("projectid")));
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
p.setPid(new ArrayList<>()); p.setPid(new ArrayList<>());
@ -302,7 +312,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
final Organization o = new Organization(); final Organization o = new Organization();
o.setId(createOpenaireId(20, rs.getString("organizationid"))); o.setId(createOpenaireId(20, rs.getString("organizationid"), true));
o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
o.setPid(new ArrayList<>()); o.setPid(new ArrayList<>());
@ -366,8 +376,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
try { try {
final DataInfo info = prepareDataInfo(rs); final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization")); final String orgId = createOpenaireId(20, rs.getString("organization"), true);
final String dsId = createOpenaireId(10, rs.getString("datasource")); final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); final Relation r1 = new Relation();
@ -415,8 +425,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
try { try {
final DataInfo info = prepareDataInfo(rs); final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("resporganization")); final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
final String projectId = createOpenaireId(40, rs.getString("project")); final String projectId = createOpenaireId(40, rs.getString("project"), true);
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); final Relation r1 = new Relation();
@ -481,14 +491,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor impl
} else { } else {
r = new Publication(); r = new Publication();
} }
r.setId(createOpenaireId(50, rs.getString("target_id"))); r.setId(createOpenaireId(50, rs.getString("target_id"), false));
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info)); r.setContext(prepareContext(rs.getString("source_id"), info));
r.setDataInfo(info); r.setDataInfo(info);
emitOaf(r); emitOaf(r);
} else { } else {
final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id")); final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id")); final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
final Relation r1 = new Relation(); final Relation r1 = new Relation();
final Relation r2 = new Relation(); final Relation r2 = new Relation();
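The claim branch above ends with the same two-way pattern used for the organization/datasource and project/organization links earlier in this file: every row yields a pair of inverse Relation objects. A hedged sketch of that pattern (the relType/subRelType/relClass values are illustrative assumptions, not copied from the commit):

final Relation r1 = new Relation();
r1.setRelType("resultResult");          // assumed values, for illustration only
r1.setSubRelType("relationship");
r1.setRelClass("isRelatedTo");
r1.setSource(sourceId);
r1.setTarget(targetId);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);

final Relation r2 = new Relation();     // and the inverse direction
r2.setRelType("resultResult");
r2.setSubRelType("relationship");
r2.setRelClass("isRelatedTo");
r2.setSource(targetId);
r2.setTarget(sourceId);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);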

View File

@ -0,0 +1,67 @@
package eu.dnetlib.dhp.migration.step1;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
import eu.dnetlib.dhp.migration.utils.MdstoreClient;
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
private final MdstoreClient mdstoreClient;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl");
final String mongoDb = parser.get("mongoDb");
final String mdFormat = parser.get("mdFormat");
final String mdLayout = parser.get("mdLayout");
final String mdInterpretation = parser.get("mdInterpretation");
final String hdfsPath = parser.get("hdfsPath");
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
app.execute(mdFormat, mdLayout, mdInterpretation);
}
}
public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
super(hdfsPath);
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
}
public void execute(final String format, final String layout, final String interpretation) {
final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
log.info("Found " + colls.size() + " mdstores");
for (final Entry<String, String> entry : colls.entrySet()) {
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) {
emit(xml, "native_" + format);
}
}
}
@Override
public void close() throws IOException {
super.close();
mdstoreClient.close();
}
}
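A minimal usage sketch (connection values assumed): after this change the Mongo import step only moves raw records into a sequence file, tagging each one with its format so that the step-2 mapping can pick the right mapper:

try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(
        "/tmp/migration/odf_records",        // hdfsPath (assumed)
        "mongodb://localhost:27017",         // mongoBaseUrl (assumed)
        "mdstore")) {                        // mongoDb (assumed)
    app.execute("ODF", "store", "cleaned");  // records are emitted with the key suffix "native_ODF"
}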

View File

@ -1,20 +1,24 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory; import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
import org.dom4j.Node; import org.dom4j.Node;
@ -37,11 +41,9 @@ import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { public abstract class AbstractMdRecordToOafMapper {
protected final Map<String, String> code2name = new HashMap<>(); protected final Map<String, String> code2name;
protected final MdstoreClient mdstoreClient;
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
@ -51,79 +53,36 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class); protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
this.code2name = code2name;
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
final String mongoDb, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser);
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
loadClassNames(dbUrl, dbUser, dbPassword);
final Map<String, String> nsContext = new HashMap<>();
registerNamespaces(nsContext);
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
} }
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { public List<Oaf> processMdRecord(final String xml) {
try {
final Map<String, String> nsContext = new HashMap<>();
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
log.info("Loading vocabulary terms from db..."); final Document doc = DocumentHelper.parseText(xml);
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { final String type = doc.valueOf("//dr:CobjCategory/@type");
code2name.clear(); final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
dbClient.processResults("select code, name from class", rs -> { final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
try { : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
code2name.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) { final DataInfo info = prepareDataInfo(doc);
e.printStackTrace(); final long lastUpdateTimestamp = new Date().getTime();
}
}); return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
} catch (final Exception e) {
throw new RuntimeException(e);
} }
log.info("Found " + code2name.size() + " terms.");
}
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation));
final Map<String, String> colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation);
log.info("Found " + colls.size() + " mdstores");
for (final Entry<String, String> entry : colls.entrySet()) {
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) {
final Document doc = DocumentHelper.parseText(xml);
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
final DataInfo info = prepareDataInfo(doc);
final long lastUpdateTimestamp = new Date().getTime();
for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) {
emitOaf(oaf);
}
}
}
log.info("All Done.");
}
protected void registerNamespaces(final Map<String, String> nsContext) {
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
} }
protected List<Oaf> createOafs(final Document doc, protected List<Oaf> createOafs(final Document doc,
@ -194,10 +153,10 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
for (final Object o : doc.selectNodes("//oaf:projectid")) { for (final Object o : doc.selectNodes("//oaf:projectid")) {
final String projectId = createOpenaireId(40, ((Node) o).getText()); final String projectId = createOpenaireId(40, ((Node) o).getText(), true);
final Relation r1 = new Relation(); final Relation r1 = new Relation();
r1.setRelType("resultProject"); r1.setRelType("resultProject");
@ -238,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
r.setDataInfo(info); r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"))); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
@ -432,10 +391,4 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
return res; return res;
} }
@Override
public void close() throws IOException {
super.close();
mdstoreClient.close();
}
} }
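With the MongoDB and HDFS plumbing removed, the mapper is now a plain, stateless converter. A sketch of the intended call pattern (imports as in the classes above; the vocabulary map would normally be loaded from the postgres class table):

// one stored record (XML string) -> graph entities plus their relations
public static List<Oaf> toOafs(final String xml, final Map<String, String> code2name) {
    final AbstractMdRecordToOafMapper mapper = new OafToOafMapper(code2name); // or OdfToOafMapper for ODF records
    return mapper.processMdRecord(xml);
}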

View File

@ -0,0 +1,173 @@
package eu.dnetlib.dhp.migration.step2;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
import eu.dnetlib.dhp.migration.utils.DbClient;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
public class GenerateEntitiesApplication {
private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
parser.parseArgument(args);
final String sourcePaths = parser.get("sourcePaths");
final String targetPath = parser.get("targetPath");
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
generateEntities(sc, code2name, existingSourcePaths, targetPath);
}
}
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
return SparkSession
.builder()
.appName(GenerateEntitiesApplication.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
}
private static void generateEntities(final JavaSparkContext sc,
final Map<String, String> code2name,
final List<String> sourcePaths,
final String targetPath) {
log.info("Generate entities from files:");
sourcePaths.forEach(log::info);
JavaRDD<String> inputRdd = sc.emptyRDD();
for (final String sp : sourcePaths) {
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
.flatMap(list -> list.iterator())
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
}
inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
}
private static List<Oaf> convertToListOaf(final String id, final String s, final Map<String, String> code2name) {
final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) {
case "native_oaf":
return new OafToOafMapper(code2name).processMdRecord(s);
case "native_odf":
return new OdfToOafMapper(code2name).processMdRecord(s);
case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization":
return Arrays.asList(convertFromJson(s, Organization.class));
case "project":
return Arrays.asList(convertFromJson(s, Project.class));
case "relation":
return Arrays.asList(convertFromJson(s, Relation.class));
case "publication":
return Arrays.asList(convertFromJson(s, Publication.class));
case "dataset":
return Arrays.asList(convertFromJson(s, Dataset.class));
case "software":
return Arrays.asList(convertFromJson(s, Software.class));
case "otherresearchproducts":
default:
return Arrays.asList(convertFromJson(s, OtherResearchProduct.class));
}
}
private static Map<String, String> loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
log.info("Loading vocabulary terms from db...");
final Map<String, String> map = new HashMap<>();
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
dbClient.processResults("select code, name from class", rs -> {
try {
map.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
}
log.info("Found " + map.size() + " terms.");
return map;
}
private static String convertToJson(final Oaf oaf) {
try {
return new ObjectMapper().writeValueAsString(oaf);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
try {
return new ObjectMapper().readValue(s, clazz);
} catch (final Exception e) {
log.error("Error parsing object of class: " + clazz);
log.error(s);
throw new RuntimeException(e);
}
}
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
try {
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
final Path path = new Path(pathToFile);
return hdfs.exists(path);
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}
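The routing in convertToListOaf relies on the key layout produced by step 1 (counter, colon, type). A short illustration with assumed sample values:

final String key = "42:native_oaf";                        // as written by AbstractMigrationApplication.emit(...)
final String type = StringUtils.substringAfter(key, ":");  // "native_oaf" -> OafToOafMapper
// "native_odf" -> OdfToOafMapper; "datasource", "project", "relation", ... -> plain JSON deserialization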

View File

@ -1,16 +1,17 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
import eu.dnetlib.dhp.migration.pace.PacePerson; import eu.dnetlib.dhp.migration.utils.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
@ -22,20 +23,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafMigrationExecutor extends AbstractMongoExecutor { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
private static final Log log = LogFactory.getLog(OafMigrationExecutor.class); public OafToOafMapper(final Map<String, String> code2name) {
super(code2name);
public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
} }
@Override @Override
@ -211,12 +202,12 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
final KeyValue collectedFrom, final KeyValue collectedFrom,
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText()); final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
final Relation r1 = new Relation(); final Relation r1 = new Relation();
r1.setRelType("resultResult"); r1.setRelType("resultResult");

View File

@ -1,4 +1,8 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -6,8 +10,6 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
@ -22,38 +24,28 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfMigrationExecutor extends AbstractMongoExecutor { public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class); public OdfToOafMapper(final Map<String, String> code2name) {
super(code2name);
public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
} }
@Override @Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
} }
@Override @Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) { protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>(); final List<Author> res = new ArrayList<>();
int pos = 1; int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) { for (final Object o : doc.selectNodes("//datacite:creator")) {
final Node n = (Node) o; final Node n = (Node) o;
final Author author = new Author(); final Author author = new Author();
author.setFullname(n.valueOf("./dc:creatorName")); author.setFullname(n.valueOf("./datacite:creatorName"));
author.setName(n.valueOf("./dc:givenName")); author.setName(n.valueOf("./datacite:givenName"));
author.setSurname(n.valueOf("./dc:familyName")); author.setSurname(n.valueOf("./datacite:familyName"));
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info)); author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info));
author.setPid(preparePids(doc, info)); author.setPid(preparePids(doc, info));
author.setRank(pos++); author.setRank(pos++);
res.add(author); res.add(author);
@ -63,7 +55,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) { private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) { for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) {
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
} }
return res; return res;
@ -72,7 +64,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>(); final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) { for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
final Instance instance = new Instance(); final Instance instance = new Instance();
instance.setUrl(Arrays.asList(((Node) o).getText().trim())); instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
@ -98,7 +90,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:date")) { for (final Object o : doc.selectNodes("//datacite:date")) {
final String dateType = ((Node) o).valueOf("@dateType"); final String dateType = ((Node) o).valueOf("@dateType");
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
@ -115,32 +107,32 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) { protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributorName", info); return prepareListFields(doc, "//datacite:contributorName", info);
} }
@Override @Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) { protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info); return prepareListFields(doc, "//datacite:format", info);
} }
@Override @Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) { protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info); return prepareField(doc, "//datacite:publisher", info);
} }
@Override @Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) { protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info); return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
} }
@Override @Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info); return prepareListStructProps(doc, "//datacite:subject", info);
} }
@Override @Override
protected Qualifier prepareLanguages(final Document doc) { protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages");
} }
@Override @Override
@ -150,17 +142,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info); return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
} }
@Override @Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info); return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
} }
@Override @Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages"); return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
} }
@Override @Override
@ -175,7 +167,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
} }
// DATASETS // DATASETS
@ -184,11 +176,11 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) { protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
final List<GeoLocation> res = new ArrayList<>(); final List<GeoLocation> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:geoLocation")) { for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
final GeoLocation loc = new GeoLocation(); final GeoLocation loc = new GeoLocation();
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox")); loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox"));
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace")); loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace"));
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint")); loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint"));
res.add(loc); res.add(loc);
} }
return res; return res;
@ -201,17 +193,17 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Updated']", info); return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
} }
@Override @Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:version", info); return prepareField(doc, "//datacite:version", info);
} }
@Override @Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:size", info); return prepareField(doc, "//datacite:size", info);
} }
@Override @Override
@ -221,18 +213,18 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Issued']", info); return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
} }
@Override @Override
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText()); final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
final String type = ((Node) o).valueOf("@relationType"); final String type = ((Node) o).valueOf("@relationType");
if (type.equals("IsSupplementTo")) { if (type.equals("IsSupplementTo")) {
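All the XPaths in this mapper now assume the datacite prefix is bound to the DataCite kernel-3 namespace, which the abstract mapper registers before parsing. A self-contained sketch of that setup (the record variable is assumed):

final Map<String, String> nsContext = new HashMap<>();
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);

final Document doc = DocumentHelper.parseText(odfRecord);  // odfRecord: any record stored by step 1 (assumed)
doc.valueOf("//datacite:title");                           // resolves against the kernel-3 schema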

View File

@ -0,0 +1,71 @@
package eu.dnetlib.dhp.migration.step3;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
public class DispatchEntitiesApplication {
private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
parser.parseArgument(args);
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
final String sourcePath = parser.get("sourcePath");
final String targetPath = parser.get("graphRawPath");
processEntity(sc, Publication.class, sourcePath, targetPath);
processEntity(sc, Dataset.class, sourcePath, targetPath);
processEntity(sc, Software.class, sourcePath, targetPath);
processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
processEntity(sc, Datasource.class, sourcePath, targetPath);
processEntity(sc, Organization.class, sourcePath, targetPath);
processEntity(sc, Project.class, sourcePath, targetPath);
processEntity(sc, Relation.class, sourcePath, targetPath);
}
}
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
return SparkSession
.builder()
.appName(DispatchEntitiesApplication.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
}
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
final String type = clazz.getSimpleName().toLowerCase();
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
sc.textFile(sourcePath)
.filter(l -> isEntityType(l, type))
.map(l -> StringUtils.substringAfter(l, "|"))
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
}
private static boolean isEntityType(final String line, final String type) {
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
}
}
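A short illustration of the line layout this class consumes (payload assumed); only the first '|' matters, so identifiers such as "50|..." inside the JSON are not affected:

final String line = "publication|{\"id\":\"50|od______2659::...\"}";
StringUtils.substringBefore(line, "|");   // "publication" -> routed to targetPath/publication
StringUtils.substringAfter(line, "|");    // the JSON payload, written verbatim to the graph raw path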

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.map.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
public class AbstractMigrationApplication implements Closeable {
private final AtomicInteger counter = new AtomicInteger(0);
private final Text key = new Text();
private final Text value = new Text();
private final SequenceFile.Writer writer;
private final ObjectMapper objectMapper = new ObjectMapper();
private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
public AbstractMigrationApplication(final String hdfsPath) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));
this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
}
private Configuration getConf() throws IOException {
final Configuration conf = new Configuration();
/*
* conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
* conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser);
* System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf);
*/
return conf;
}
protected void emit(final String s, final String type) {
try {
key.set(counter.getAndIncrement() + ":" + type);
value.set(s);
writer.append(key, value);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
protected void emitOaf(final Oaf oaf) {
try {
emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase());
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public ObjectMapper getObjectMapper() {
return objectMapper;
}
@Override
public void close() throws IOException {
writer.hflush();
writer.close();
}
}
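Keys are now written as counter + ":" + type, with no name node or user handling in the writer. A quick way to inspect the output, sketched with an assumed JavaSparkContext sc and an assumed path:

sc.sequenceFile("/tmp/migration/oaf_records", Text.class, Text.class)
        .map(t -> t._1().toString())       // keys look like "42:native_OAF" or "7:datasource"
        .take(5)
        .forEach(System.out::println);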

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
@ -28,8 +28,8 @@ public class DbClient implements Closeable {
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address); StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
this.connection.setAutoCommit(false); this.connection.setAutoCommit(false);
} catch (final Exception e) { } catch (final Exception e) {
log.error(e.getClass().getName() + ": " + e.getMessage()); log.error("Connection to postgresDB failed");
throw new RuntimeException(e); throw new RuntimeException("Connection to postgresDB failed", e);
} }
log.info("Opened database successfully"); log.info("Opened database successfully");
} }
@ -44,10 +44,12 @@ public class DbClient implements Closeable {
consumer.accept(rs); consumer.accept(rs);
} }
} catch (final SQLException e) { } catch (final SQLException e) {
throw new RuntimeException(e); log.error("Error executing sql query: " + sql, e);
throw new RuntimeException("Error executing sql query", e);
} }
} catch (final SQLException e1) { } catch (final SQLException e1) {
throw new RuntimeException(e1); log.error("Error preparing sql statement", e1);
throw new RuntimeException("Error preparing sql statement", e1);
} }
} }

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;

View File

@ -1,24 +1,12 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.map.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo; import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
@ -26,60 +14,12 @@ import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance; import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OriginDescription; import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class AbstractMigrationExecutor implements Closeable { public class OafMapperUtils {
private final AtomicInteger counter = new AtomicInteger(0);
private final Text key = new Text();
private final Text value = new Text();
private final ObjectMapper objectMapper = new ObjectMapper();
private final SequenceFile.Writer writer;
private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class);
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser));
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
}
private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException {
final Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("HADOOP_USER_NAME", hdfsUser);
System.setProperty("hadoop.home.dir", "/");
FileSystem.get(URI.create(hdfsNameNode), conf);
return conf;
}
protected void emitOaf(final Oaf oaf) {
try {
key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase());
value.set(objectMapper.writeValueAsString(oaf));
writer.append(key, value);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
writer.hflush();
writer.close();
}
public static KeyValue keyValue(final String k, final String v) { public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue(); final KeyValue kv = new KeyValue();
@ -223,28 +163,33 @@ public class AbstractMigrationExecutor implements Closeable {
return d; return d;
} }
public static String createOpenaireId(final int prefix, final String originalId) { public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::"); if (to_md5) {
final String rest = StringUtils.substringAfter(originalId, "::"); final String nsPrefix = StringUtils.substringBefore(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
} }
public static String createOpenaireId(final String type, final String originalId) { public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) {
switch (type) { switch (type) {
case "datasource": case "datasource":
return createOpenaireId(10, originalId); return createOpenaireId(10, originalId, to_md5);
case "organization": case "organization":
return createOpenaireId(20, originalId); return createOpenaireId(20, originalId, to_md5);
case "person": case "person":
return createOpenaireId(30, originalId); return createOpenaireId(30, originalId, to_md5);
case "project": case "project":
return createOpenaireId(40, originalId); return createOpenaireId(40, originalId, to_md5);
default: default:
return createOpenaireId(50, originalId); return createOpenaireId(50, originalId, to_md5);
} }
} }
public static String asString(final Object o) { public static String asString(final Object o) {
return o == null ? "" : o.toString(); return o == null ? "" : o.toString();
} }
} }
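For clarity, the effect of the new boolean on the generated identifiers (the original ids below are made up):

createOpenaireId(10, "opendoar____::2659", true);           // -> "10|opendoar____::" + md5("2659")
createOpenaireId(50, "od______2659::0123abcd", false);      // -> "50|od______2659::0123abcd" (kept verbatim)
createOpenaireId("project", "corda_______::200785", true);  // delegates to the 40| branch of the switch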

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.migration.pace; package eu.dnetlib.dhp.migration.utils;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.text.Normalizer; import java.text.Normalizer;

View File

@ -1,8 +1,8 @@
[ [
{ {
"paramName": "s", "paramName": "s",
"paramLongName": "sourcePaths", "paramLongName": "sourcePath",
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)", "paramDescription": "the source path",
"paramRequired": true "paramRequired": true
}, },
{ {

View File

@ -0,0 +1,39 @@
[
{
"paramName": "s",
"paramLongName": "sourcePaths",
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
"paramRequired": true
},
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the path of the target file",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]
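A sketch of the matching argument list for GenerateEntitiesApplication, using the short names defined above and assuming the usual single-dash convention of the workflows (all values illustrative):

final String[] args = {
    "-mt", "local[*]",
    "-s", "/tmp/migration/db_entities,/tmp/migration/oaf_records",   // sourcePaths (assumed)
    "-t", "/tmp/migration/all_entities",                             // targetPath (assumed)
    "-pgurl", "jdbc:postgresql://localhost:5432/dnet_openaireplus",
    "-pguser", "dnet",
    "-pgpasswd", "***"
};
GenerateEntitiesApplication.main(args);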

View File

@ -0,0 +1,10 @@
[
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
]

View File

@ -6,31 +6,19 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "n", "paramName": "pgurl",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{
"paramName": "dburl",
"paramLongName": "postgresUrl", "paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "dbuser", "paramName": "pguser",
"paramLongName": "postgresUser", "paramLongName": "postgresUser",
"paramDescription": "postgres user", "paramDescription": "postgres user",
"paramRequired": false "paramRequired": false
}, },
{ {
"paramName": "dbpasswd", "paramName": "pgpasswd",
"paramLongName": "postgresPassword", "paramLongName": "postgresPassword",
"paramDescription": "postgres password", "paramDescription": "postgres password",
"paramRequired": false "paramRequired": false

View File

@ -5,18 +5,6 @@
"paramDescription": "the path where storing the sequential file", "paramDescription": "the path where storing the sequential file",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{ {
"paramName": "mongourl", "paramName": "mongourl",
"paramLongName": "mongoBaseUrl", "paramLongName": "mongoBaseUrl",
@ -24,7 +12,7 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "db", "paramName": "mongodb",
"paramLongName": "mongoDb", "paramLongName": "mongoDb",
"paramDescription": "mongo database", "paramDescription": "mongo database",
"paramRequired": true "paramRequired": true
@ -46,23 +34,5 @@
"paramLongName": "mdInterpretation", "paramLongName": "mdInterpretation",
"paramDescription": "metadata interpretation", "paramDescription": "metadata interpretation",
"paramRequired": true "paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
} }
] ]

View File

@ -1,206 +0,0 @@
<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>graphRawPath</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongourl</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="ImportEntitiesFromPostgres"/>
<error to="Kill"/>
</action>
<action name="ImportEntitiesFromPostgres">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-dburl</arg><arg>${postgresURL}</arg>
<arg>-dbuser</arg><arg>${postgresUser}</arg>
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportClaimsFromPostgres"/>
<error to="Kill"/>
</action>
<action name="ImportClaimsFromPostgres">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_claims</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-dburl</arg><arg>${postgresURL}</arg>
<arg>-dbuser</arg><arg>${postgresUser}</arg>
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
<arg>-a</arg><arg>claims</arg>
</java>
<ok to="ImportODFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportODFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportOAFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportOAFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODFClaimsFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportODFClaimsFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_claims</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportOAFClaimsFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportOAFClaimsFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_claims</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ResetGraphRawPath"/>
<error to="Kill"/>
</action>
<action name="ResetGraphRawPath">
<fs>
<delete path='${graphRawPath}'/>
<mkdir path='${graphRawPath}'/>
</fs>
<ok to="ExtractEntitiesInGraphRawPath"/>
<error to="Kill"/>
</action>
<action name="ExtractEntitiesInGraphRawPath">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingPath}/db_entities,${workingPath}/oaf_entities,${workingPath}/odf_entities</arg>
<arg>-g</arg><arg>${graphRawPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}
]
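
These three parameters are read on the job side through ArgumentApplicationParser, the same helper used by the other Spark jobs in this diff. A minimal sketch of how TransformActions could consume them is shown below; the class wrapper and resource path are assumptions for illustration only, while the parameter long names are the ones defined above.

    import org.apache.commons.io.IOUtils;
    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class TransformActionsArgsSketch {
        public static void main(String[] args) throws Exception {
            // Hypothetical resource name; only the parameter names come from the JSON above.
            final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                    IOUtils.toString(TransformActionsArgsSketch.class.getResourceAsStream(
                            "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
            parser.parseArgument(args);

            final String master = parser.get("master");                       // -mt, "local" or "yarn"
            final String isLookupUrl = parser.get("isLookupUrl");             // -is
            final String[] inputPaths = parser.get("inputPaths").split(",");  // -i, comma separated paths
            System.out.println(master + " " + isLookupUrl + " " + inputPaths.length + " input path(s)");
        }
    }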

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/applicationHistory</value>
</property>
</configuration>

View File

@ -0,0 +1,122 @@
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
<parameters>
<property>
<name>sourceNN</name>
<description>the source name node</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>workingDirectory</name>
<value>/tmp/actionsets</value>
<description>working directory</description>
</property>
<property>
<name>distcp_memory_mb</name>
<value>6144</value>
<description>memory for distcp copying actionsets from remote cluster</description>
</property>
<property>
<name>distcp_task_timeout</name>
<value>60000000</value>
<description>timeout for distcp copying actions from remote cluster</description>
</property>
<property>
<name>distcp_num_maps</name>
<value>1</value>
<description>maximum number of map tasks used in the distcp process</description>
</property>
<property>
<name>transform_only</name>
<description>activate transform-only mode. Only apply the transformation step</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to='migrate_actionsets' />
<action name='migrate_actionsets'>
<java>
<main-class>eu.dnetlib.dhp.migration.actions.MigrateActionSet</main-class>
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>-sn</arg><arg>${sourceNN}</arg>
<arg>-tn</arg><arg>${nameNode}</arg>
<arg>-w</arg><arg>${workingDirectory}</arg>
<arg>-nm</arg><arg>${distcp_num_maps}</arg>
<arg>-mm</arg><arg>${distcp_memory_mb}</arg>
<arg>-tt</arg><arg>${distcp_task_timeout}</arg>
<arg>-tr</arg><arg>${transform_only}</arg>
<capture-output/>
</java>
<ok to="transform_actions" />
<error to="fail" />
</action>
<action name="transform_actions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>transform_actions</name>
<class>eu.dnetlib.dhp.migration.actions.TransformActions</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores ${sparkExecutorCores}
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
</spark>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end" />
</workflow-app>
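
The migrate_actionsets step declares <capture-output/>, and the transform_actions step then reads ${wf:actionData('migrate_actionsets')['target_paths']}. That handshake relies on the standard Oozie convention for Java actions: the main class writes a properties file to the path exposed through the oozie.action.output.properties system property. The sketch below only illustrates that convention; it is not the actual MigrateActionSet code, and the paths are placeholders.

    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.OutputStream;
    import java.util.Properties;

    public class CaptureOutputSketch {
        public static void main(String[] args) throws Exception {
            // Values stored here surface in the workflow as wf:actionData('migrate_actionsets')['target_paths'].
            final Properties props = new Properties();
            props.setProperty("target_paths", "/tmp/actionsets/set_a,/tmp/actionsets/set_b"); // placeholder paths

            final File out = new File(System.getProperty("oozie.action.output.properties"));
            try (OutputStream os = new FileOutputStream(out)) {
                props.store(os, "");
            }
        }
    }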

View File

@ -15,8 +15,4 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>hdfsUser</name>
<value>dnet</value>
</property>
</configuration> </configuration>

View File

@ -0,0 +1,169 @@
<workflow-app name="import Claims as Graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationClaimsPathStep1</name>
<description>the base path to store HDFS files</description>
</property>
<property>
<name>migrationClaimsPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>migrationClaimsPathStep3</name>
<description>the raw graph base path</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL used to access the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the postgres user</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationClaimsPathStep1}'/>
<mkdir path='${migrationClaimsPathStep1}'/>
</fs>
<ok to="ImportDBClaims"/>
<error to="Kill"/>
</action>
<action name="ImportDBClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/db_claims</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
<arg>-a</arg><arg>claims</arg>
</java>
<ok to="ImportODFClaims"/>
<error to="Kill"/>
</action>
<action name="ImportODFClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/odf_claims</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
</java>
<ok to="ImportOAFClaims"/>
<error to="Kill"/>
</action>
<action name="ImportOAFClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/oaf_claims</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
</java>
<ok to="ResetClaimEntities"/>
<error to="Kill"/>
</action>
<action name="ResetClaimEntities">
<fs>
<delete path='${migrationClaimsPathStep2}'/>
<mkdir path='${migrationClaimsPathStep2}'/>
</fs>
<ok to="GenerateClaimEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateClaimEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateClaimEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims</arg>
<arg>-t</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="ResetClaimGraph"/>
<error to="Kill"/>
</action>
<action name="ResetClaimGraph">
<fs>
<delete path='${migrationClaimsPathStep3}'/>
<mkdir path='${migrationClaimsPathStep3}'/>
</fs>
<ok to="GenerateClaimGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateClaimGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateClaimGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
<arg>-g</arg><arg>${migrationClaimsPathStep3}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,197 @@
<workflow-app name="import regular entities as Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<value>/tmp/dhp_migration</value>
<description>the base path to store temporary intermediate data</description>
</property>
<property>
<name>graphBasePath</name>
<description>the target path to store the raw graph</description>
</property>
<property>
<name>reuseContent</name>
<value>false</value>
<description>whether to import content from the aggregator or reuse a previous version</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL used to access the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the postgres user</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="ReuseContent"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="ReuseContent">
<switch>
<case to="ResetWorkingPath">${wf:conf('reuseContent') eq false}</case>
<case to="ResetAllEntitiesPath">${wf:conf('reuseContent') eq true}</case>
<default to="ResetWorkingPath"/>
</switch>
</decision>
<action name="ResetWorkingPath">
<fs>
<delete path="${workingPath}"/>
<mkdir path="${workingPath}"/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>
</action>
<action name="ImportODF">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ImportOAF"/>
<error to="Kill"/>
</action>
<action name="ImportOAF">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ResetAllEntitiesPath"/>
<error to="Kill"/>
</action>
<action name="ResetAllEntitiesPath">
<fs>
<delete path="${workingPath}/all_entities"/>
</fs>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records</arg>
<arg>-t</arg><arg>${workingPath}/all_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="ResetGraphPath"/>
<error to="Kill"/>
</action>
<action name="ResetGraphPath">
<fs>
<delete path="${graphBasePath}/graph_raw"/>
<mkdir path="${graphBasePath}/graph_raw"/>
</fs>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingPath}/all_entities</arg>
<arg>-g</arg><arg>${graphBasePath}/graph_raw</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,103 @@
<workflow-app name="import regular entities as Graph (step 1)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store HDFS files</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL used to access the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the postgres user</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationPathStep1}'/>
<mkdir path='${migrationPathStep1}'/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>
</action>
<action name="ImportODF">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ImportOAF"/>
<error to="Kill"/>
</action>
<action name="ImportOAF">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,74 @@
<workflow-app name="import regular entities as Graph (step 2)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store HDFS files</description>
</property>
<property>
<name>migrationPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL used to access the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the postgres user</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetEntities"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetEntities">
<fs>
<delete path='${migrationPathStep2}'/>
<mkdir path='${migrationPathStep2}'/>
</fs>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,60 @@
<workflow-app name="import regular entities as Graph (step 3)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>migrationPathStep3</name>
<description>the raw graph base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetGraph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetGraph">
<fs>
<delete path='${migrationPathStep3}'/>
<mkdir path='${migrationPathStep3}'/>
</fs>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
<arg>-g</arg><arg>${migrationPathStep3}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,79 +1,87 @@
package eu.dnetlib.dhp.collection; package eu.dnetlib.dhp.collection;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.*;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance;
public class CollectionJobTest { public class CollectionJobTest {
private Path testDir;
@Before private Path testDir;
public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection");
}
@After @Before
public void teadDown() throws IOException { public void setup() throws IOException {
FileUtils.deleteDirectory(testDir.toFile()); testDir = Files.createTempDirectory("dhp-collection");
} }
@Test @After
public void tesCollection() throws Exception { public void teadDown() throws IOException {
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); FileUtils.deleteDirectory(testDir.toFile());
GenerateNativeStoreSparkJob.main(new String[] { }
"-mt", "local",
"-w", "wid",
"-e", "XML",
"-d", ""+System.currentTimeMillis(),
"-p", new ObjectMapper().writeValueAsString(provenance),
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
"-o", testDir.toString()+"/store",
"-t", "true",
"-ru", "",
"-rp", "",
"-rh", "",
"-ro", "",
"-rr", ""});
System.out.println(new ObjectMapper().writeValueAsString(provenance));
}
@Test
public void tesCollection() throws Exception {
final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
GenerateNativeStoreSparkJob.main(new String[] {
"-mt", "local",
"-w", "wid",
"-e", "XML",
"-d", "" + System.currentTimeMillis(),
"-p", new ObjectMapper().writeValueAsString(provenance),
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
"-o", testDir.toString() + "/store",
"-t", "true",
"-ru", "",
"-rp", "",
"-rh", "",
"-ro", "",
"-rr", "" });
System.out.println(new ObjectMapper().writeValueAsString(provenance));
}
@Test
public void testGenerationMetadataRecord() throws Exception {
@Test final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
public void testGenerationMetadataRecord() throws Exception {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); final MetadataRecord record = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); assert record != null;
System.out.println(record.getId());
System.out.println(record.getOriginalId());
assert record != null; }
System.out.println(record.getId());
System.out.println(record.getOriginalId());
@Test
public void TestEquals() throws IOException {
} final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
final MetadataRecord record = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
final MetadataRecord record1 = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
assert record != null;
record.setBody("ciao");
assert record1 != null;
record1.setBody("mondo");
Assert.assertEquals(record, record1);
}
@Test
public void TestEquals () throws IOException {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
assert record != null;
record.setBody("ciao");
assert record1 != null;
record1.setBody("mondo");
Assert.assertEquals(record, record1);
}
} }

View File

@ -74,6 +74,6 @@ public class SparkCreateConnectedComponent {
} }
public static long getHashcode(final String id) { public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); return Hashing.murmur3_128().hashString(id).asLong();
} }
} }
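
Side note on the change above: hashString(CharSequence) without an explicit Charset was deprecated in Guava in favour of hashUnencodedChars, which hashes the UTF-16 code units directly and therefore needs no charset at all. A small illustrative snippet (the identifier below is made up):

    import com.google.common.hash.Hashing;

    public class HashcodeSketch {
        public static void main(String[] args) {
            // Illustrative only: a made-up OpenAIRE-style identifier.
            final String id = "50|doi_________::0123456789abcdef";
            // Deterministic 64-bit value derived from the 128-bit murmur3 hash, no charset involved.
            final long hash = Hashing.murmur3_128().hashUnencodedChars(id).asLong();
            System.out.println(id + " -> " + hash);
        }
    }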

View File

@ -18,13 +18,13 @@ public class GraphMappingUtils {
public final static Map<String, Class> types = Maps.newHashMap(); public final static Map<String, Class> types = Maps.newHashMap();
static { static {
types.put("datasource", Datasource.class); types.put("datasource", Datasource.class);
types.put("organization", Organization.class); types.put("organization", Organization.class);
types.put("project", Project.class); types.put("project", Project.class);
types.put("dataset", Dataset.class); types.put("dataset", Dataset.class);
types.put("otherresearchproduct", OtherResearchProduct.class); types.put("otherresearchproduct", OtherResearchProduct.class);
types.put("software", Software.class); types.put("software", Software.class);
types.put("publication", Publication.class); types.put("publication", Publication.class);
types.put("relation", Relation.class); types.put("relation", Relation.class);
} }

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.graph;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
@ -13,31 +13,40 @@ public class SparkGraphImporterJob {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream(
"/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
final SparkSession spark = SparkSession
try(SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String hiveDbName = parser.get("hive_db_name");
spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
// Read the input file and convert it into RDD of serializable object
GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
.map(s -> new ObjectMapper().readValue(s, clazz))
.rdd(), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.saveAsTable(hiveDbName + "." + name));
}
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
return SparkSession
.builder() .builder()
.appName(SparkGraphImporterJob.class.getSimpleName()) .appName(SparkGraphImporterJob.class.getSimpleName())
.master(parser.get("master")) .master(parser.get("master"))
.config("hive.metastore.uris", parser.get("hive_metastore_uris")) .config(conf)
.enableHiveSupport() .enableHiveSupport()
.getOrCreate(); .getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String hiveDbName = parser.get("hive_db_name");
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
// Read the input file and convert it into RDD of serializable object
GraphMappingUtils.types.forEach((name, clazz) -> {
spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
.map(s -> new ObjectMapper().readValue(s._2().toString(), clazz))
.rdd(), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.saveAsTable(hiveDbName + "." + name);
});
} }
} }
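
With this refactoring the importer reads newline-delimited JSON from <sourcePath>/<entity name> for every entry in GraphMappingUtils.types and saves one Hive table per entity type. A hedged way to run it locally for a quick smoke test follows; all paths, the database name and the metastore URI are placeholders, while the parameter names match the ones passed by the oozie workflow.

    public class RunGraphImporterSketch {
        public static void main(String[] args) throws Exception {
            eu.dnetlib.dhp.graph.SparkGraphImporterJob.main(new String[] {
                    "-mt", "local[*]",
                    "--sourcePath", "/tmp/graph_raw",                  // expects /tmp/graph_raw/publication, /dataset, ...
                    "--hive_db_name", "openaire_test",
                    "--hive_metastore_uris", "thrift://localhost:9083"
            });
        }
    }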

View File

@ -0,0 +1,8 @@
CREATE view result as
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.publication p
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.dataset d
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.software s
union all
select id, dateofcollection, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.otherresearchproduct o;
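
The view gives downstream steps a single result relation instead of four separate tables for publication, dataset, software and otherresearchproduct. A hedged example of reading it back from Spark once the PostProcessing hive action has run; the database name and metastore URI are placeholders:

    import org.apache.spark.sql.SparkSession;

    public class QueryResultViewSketch {
        public static void main(String[] args) {
            final SparkSession spark = SparkSession.builder()
                    .appName("QueryResultViewSketch")
                    .master("local[*]")
                    .config("hive.metastore.uris", "thrift://localhost:9083") // placeholder metastore
                    .enableHiveSupport()
                    .getOrCreate();

            // Counts rows of the unified view created by postprocessing.sql.
            spark.sql("SELECT count(*) AS results FROM openaire_test.result").show();
            spark.stop();
        }
    }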

View File

@ -1,4 +1,5 @@
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>sourcePath</name>
@ -22,6 +23,21 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="MapGraphIntoDataFrame"/> <start to="MapGraphIntoDataFrame"/>
<kill name="Kill"> <kill name="Kill">
@ -30,19 +46,39 @@
<action name="MapGraphIntoDataFrame"> <action name="MapGraphIntoDataFrame">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <master>yarn</master>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>MapGraphIntoDataFrame</name> <name>MapGraphIntoDataFrame</name>
<class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class> <class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts> <spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg> <arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_db_name</arg><arg>${hive_db_name}</arg> <arg>--hive_db_name</arg><arg>${hive_db_name}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
</spark> </spark>
<ok to="PostProcessing"/>
<error to="Kill"/>
</action>
<action name="PostProcessing">
<hive xmlns="uri:oozie:hive-action:0.2">
<configuration>
<property>
<name>oozie.hive.defaults</name>
<value>hive-site.xml</value>
</property>
</configuration>
<script>lib/scripts/postprocessing.sql</script>
<param>hive_db_name=${hive_db_name}</param>
</hive>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>

View File

@ -1,10 +1,12 @@
sparkDriverMemory=8G sparkDriverMemory=10G
sparkExecutorMemory=8G sparkExecutorMemory=15G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision outputPath=/tmp/openaire_provision
format=TMF format=TMF
batchSize=2000 batchSize=2000
sparkExecutorCoresForJoining=128
sparkExecutorCoresForIndexing=64 sparkExecutorCoresForIndexing=64
reuseRecords=true reuseRecords=false
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource

View File

@ -1,31 +1,32 @@
package eu.dnetlib.dhp.graph; package eu.dnetlib.dhp.graph;
import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Maps;
import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.graph.model.*; import eu.dnetlib.dhp.graph.model.*;
import eu.dnetlib.dhp.graph.utils.ContextMapper; import eu.dnetlib.dhp.graph.utils.ContextMapper;
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
import eu.dnetlib.dhp.graph.utils.RelationPartitioner;
import eu.dnetlib.dhp.graph.utils.XmlRecordFactory; import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2; import scala.Tuple2;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashSet; import java.util.Map;
import java.util.List;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity; import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
@ -45,10 +46,12 @@ import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
* and E_target = T. Objects in T are heavily pruned by all the unnecessary information * and E_target = T. Objects in T are heavily pruned by all the unnecessary information
* *
* 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ <T, R> ] * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ <T, R> ]
*/ */
public class GraphJoiner implements Serializable { public class GraphJoiner implements Serializable {
private Map<String, LongAccumulator> accumulators = Maps.newHashMap();
public static final int MAX_RELS = 100; public static final int MAX_RELS = 100;
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
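
Point 4 of the comment above compresses the whole join strategy into a single line. The sketch below spells out that shape with plain JavaPairRDDs; it is a simplified illustration only, and the element types, the pruning of the target entity and the re-keying payload are placeholders rather than the actual GraphJoiner types.

    import org.apache.spark.api.java.JavaPairRDD;
    import scala.Tuple2;

    public class JoinShapeSketch {

        // "(((T join R.target) union S) groupby S.id)": relations keyed by target id are joined
        // with the (pruned) target entities, re-keyed by source id, unioned with the source
        // entities and finally grouped per source id.
        static JavaPairRDD<String, Iterable<String>> adjacency(
                JavaPairRDD<String, String> entitiesById,                    // id -> serialized entity (plays both S and T)
                JavaPairRDD<String, Tuple2<String, String>> relsByTargetId)  // target id -> (source id, serialized relation)
        {
            final JavaPairRDD<String, String> relPlusTarget = relsByTargetId
                    .join(entitiesById)                                // target id -> ((source id, rel), target entity)
                    .mapToPair(t -> new Tuple2<>(
                            t._2()._1()._1(),                          // re-key by the relation's source id
                            t._2()._1()._2() + " -> " + t._2()._2())); // placeholder "<relation, pruned target>" payload

            return entitiesById                                        // S, keyed by its own id
                    .union(relPlusTarget)
                    .groupByKey();                                     // S.id -> [ entity, <rel, target>, ... ]
        }
    }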
@ -61,24 +64,30 @@ public class GraphJoiner implements Serializable {
private String outPath; private String outPath;
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) { private String otherDsTypeId;
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) {
this.spark = spark; this.spark = spark;
this.contextMapper = contextMapper; this.contextMapper = contextMapper;
this.otherDsTypeId = otherDsTypeId;
this.inputPath = inputPath; this.inputPath = inputPath;
this.outPath = outPath; this.outPath = outPath;
final SparkContext sc = spark.sparkContext();
prepareAccumulators(sc);
} }
public GraphJoiner adjacencyLists() { public GraphJoiner adjacencyLists() {
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext()); final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext());
// read each entity // read each entity
JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource"); JavaPairRDD<String, TypedRow> datasource = readPathEntity(jsc, getInputPath(), "datasource");
JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization"); JavaPairRDD<String, TypedRow> organization = readPathEntity(jsc, getInputPath(), "organization");
JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project"); JavaPairRDD<String, TypedRow> project = readPathEntity(jsc, getInputPath(), "project");
JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset"); JavaPairRDD<String, TypedRow> dataset = readPathEntity(jsc, getInputPath(), "dataset");
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct"); JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct");
JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software"); JavaPairRDD<String, TypedRow> software = readPathEntity(jsc, getInputPath(), "software");
JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication"); JavaPairRDD<String, TypedRow> publication = readPathEntity(jsc, getInputPath(), "publication");
// create the union between all the entities // create the union between all the entities
final String entitiesPath = getOutPath() + "/entities"; final String entitiesPath = getOutPath() + "/entities";
@ -93,31 +102,43 @@ public class GraphJoiner implements Serializable {
.map(GraphMappingUtils::serialize) .map(GraphMappingUtils::serialize)
.saveAsTextFile(entitiesPath, GzipCodec.class); .saveAsTextFile(entitiesPath, GzipCodec.class);
JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath) JavaPairRDD<String, EntityRelEntity> entities = jsc.textFile(entitiesPath)
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
final String relationPath = getOutPath() + "/relation";
// reads the relationships // reads the relationships
final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath()) final JavaPairRDD<SortableRelationKey, EntityRelEntity> rels = readPathRelation(jsc, getInputPath())
.filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted .filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted
.map(p -> new EntityRelEntity().setRelation(p)) .map(p -> new EntityRelEntity().setRelation(p))
.mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p)) .mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p));
.groupByKey() rels
.groupByKey(new RelationPartitioner(rels.getNumPartitions()))
.map(p -> Iterables.limit(p._2(), MAX_RELS)) .map(p -> Iterables.limit(p._2(), MAX_RELS))
.flatMap(p -> p.iterator()) .flatMap(p -> p.iterator())
.map(s -> new ObjectMapper().writeValueAsString(s))
.saveAsTextFile(relationPath, GzipCodec.class);
final JavaPairRDD<String, EntityRelEntity> relation = jsc.textFile(relationPath)
.map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class))
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
//final String bySource = getOutPath() + "/1_join_by_target"; final String bySourcePath = getOutPath() + "/join_by_source";
JavaPairRDD<String, EntityRelEntity> bySource = relation relation
.join(entities .join(entities
.filter(e -> !e._2().getSource().getDeleted()) .filter(e -> !e._2().getSource().getDeleted())
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2())))) .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
.map(s -> new EntityRelEntity() .map(s -> new EntityRelEntity()
.setRelation(s._2()._1().getRelation()) .setRelation(s._2()._1().getRelation())
.setTarget(s._2()._2().getSource())) .setTarget(s._2()._2().getSource()))
.map(j -> new ObjectMapper().writeValueAsString(j))
.saveAsTextFile(bySourcePath, GzipCodec.class);
JavaPairRDD<String, EntityRelEntity> bySource = jsc.textFile(bySourcePath)
.map(e -> getObjectMapper().readValue(e, EntityRelEntity.class))
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>()); final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
entities entities
.union(bySource) .union(bySource)
.groupByKey() // by source id .groupByKey() // by source id
@ -130,20 +151,6 @@ public class GraphJoiner implements Serializable {
return this; return this;
} }
public GraphJoiner asXML() {
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
final ObjectMapper mapper = new ObjectMapper();
final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
sc.textFile(joinedEntitiesPath)
.map(s -> mapper.readValue(s, JoinedEntity.class))
.mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
.saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
return this;
}
public SparkSession getSpark() { public SparkSession getSpark() {
return spark; return spark;
} }
@ -158,24 +165,23 @@ public class GraphJoiner implements Serializable {
// HELPERS // HELPERS
private OafEntity parseOaf(final String json, final String type) { private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) {
final ObjectMapper o = new ObjectMapper();
try { try {
switch (GraphMappingUtils.EntityType.valueOf(type)) { switch (GraphMappingUtils.EntityType.valueOf(type)) {
case publication: case publication:
return o.readValue(json, Publication.class); return mapper.readValue(json, Publication.class);
case dataset: case dataset:
return o.readValue(json, Dataset.class); return mapper.readValue(json, Dataset.class);
case otherresearchproduct: case otherresearchproduct:
return o.readValue(json, OtherResearchProduct.class); return mapper.readValue(json, OtherResearchProduct.class);
case software: case software:
return o.readValue(json, Software.class); return mapper.readValue(json, Software.class);
case datasource: case datasource:
return o.readValue(json, Datasource.class); return mapper.readValue(json, Datasource.class);
case organization: case organization:
return o.readValue(json, Organization.class); return mapper.readValue(json, Organization.class);
case project: case project:
return o.readValue(json, Project.class); return mapper.readValue(json, Project.class);
default: default:
throw new IllegalArgumentException("invalid type: " + type); throw new IllegalArgumentException("invalid type: " + type);
} }
@ -185,26 +191,26 @@ public class GraphJoiner implements Serializable {
} }
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) { private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
final ObjectMapper o = new ObjectMapper(); final ObjectMapper mapper = getObjectMapper();
final JoinedEntity j = new JoinedEntity(); final JoinedEntity j = new JoinedEntity();
final Links links2 = new Links(); final Links links = new Links();
for(EntityRelEntity rel : p._2()) { for(EntityRelEntity rel : p._2()) {
if (rel.hasMainEntity() & j.getEntity() == null) { if (rel.hasMainEntity() & j.getEntity() == null) {
j.setType(rel.getSource().getType()); j.setType(rel.getSource().getType());
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType())); j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper));
} }
if (rel.hasRelatedEntity()) { if (rel.hasRelatedEntity()) {
try { try {
links2.add( links.add(
new eu.dnetlib.dhp.graph.model.Tuple2() new eu.dnetlib.dhp.graph.model.Tuple2()
.setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class)) .setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class))
.setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); .setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
} catch (IOException e) { } catch (IOException e) {
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
} }
} }
} }
j.setLinks(links2); j.setLinks(links);
if (j.getEntity() == null) { if (j.getEntity() == null) {
throw new IllegalStateException("missing main entity on '" + p._1() + "'"); throw new IllegalStateException("missing main entity on '" + p._1() + "'");
} }
@ -220,9 +226,8 @@ public class GraphJoiner implements Serializable {
* @return the JavaPairRDD<String, TypedRow> indexed by entity identifier * @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
*/ */
private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) return sc.textFile(inputPath + "/" + type)
.mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> { .mapToPair((PairFunction<String, String, TypedRow>) s -> {
final String s = item._2().toString();
final DocumentContext json = JsonPath.parse(s); final DocumentContext json = JsonPath.parse(s);
final String id = json.read("$.id"); final String id = json.read("$.id");
return new Tuple2<>(id, new TypedRow() return new Tuple2<>(id, new TypedRow()
@ -241,17 +246,46 @@ public class GraphJoiner implements Serializable {
* @return the JavaRDD<TypedRow> containing all the relationships * @return the JavaRDD<TypedRow> containing all the relationships
*/ */
private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) { private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) return sc.textFile(inputPath + "/relation")
.map(item -> { .map(s -> {
final String s = item._2().toString();
final DocumentContext json = JsonPath.parse(s); final DocumentContext json = JsonPath.parse(s);
return new TypedRow() return new TypedRow()
.setSourceId(json.read("$.source")) .setSourceId(json.read("$.source"))
.setTargetId(json.read("$.target")) .setTargetId(json.read("$.target"))
.setDeleted(json.read("$.dataInfo.deletedbyinference")) .setDeleted(json.read("$.dataInfo.deletedbyinference"))
.setType("relation") .setType("relation")
.setRelType("$.relType")
.setSubRelType("$.subRelType")
.setRelClass("$.relClass")
.setOaf(s); .setOaf(s);
}); });
} }
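For reference, a minimal sketch of the relation rows this method parses; every identifier and value below is illustrative, only the field names mirror the JsonPath expressions used above.
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
public class RelationRowSketch {
    public static void main(String[] args) {
        // Illustrative record: ids and values are made up, field names match the reads in readPathRelation
        String s = "{\"source\":\"40|corda__h2020::p1\",\"target\":\"50|doi_________::r1\","
                + "\"relType\":\"resultProject\",\"subRelType\":\"outcome\",\"relClass\":\"isProducedBy\","
                + "\"dataInfo\":{\"deletedbyinference\":false}}";
        DocumentContext json = JsonPath.parse(s);
        String source = json.read("$.source");                        // 40|corda__h2020::p1
        String relClass = json.read("$.relClass");                    // isProducedBy
        Boolean deleted = json.read("$.dataInfo.deletedbyinference"); // false
        System.out.println(source + " " + relClass + " deleted=" + deleted);
    }
}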
private ObjectMapper getObjectMapper() {
return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
}
private void prepareAccumulators(SparkContext sc) {
accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments"));
accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf"));
accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution"));
accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant"));
accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant"));
accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy"));
accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides"));
}
} }

View File

@ -24,6 +24,7 @@ public class SparkXmlRecordBuilderJob {
final String inputPath = parser.get("sourcePath"); final String inputPath = parser.get("sourcePath");
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
final String otherDsTypeId = parser.get("otherDsTypeId");
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
if (fs.exists(new Path(outputPath))) { if (fs.exists(new Path(outputPath))) {
@ -31,8 +32,9 @@ public class SparkXmlRecordBuilderJob {
fs.mkdirs(new Path(outputPath)); fs.mkdirs(new Path(outputPath));
} }
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath) new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
.adjacencyLists(); .adjacencyLists();
//.asXML();
} }
} }

View File

@ -0,0 +1,99 @@
package eu.dnetlib.dhp.graph.model;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import java.io.Serializable;
import java.util.Map;
/**
* Allows sorting relationships according to the priority defined in the weights map.
*/
public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
private String sourceId;
private String targetId;
private String relType;
private String subRelType;
private String relClass;
private final static Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put("outcome", 0);
weights.put("supplement", 1);
weights.put("publicationDataset", 2);
weights.put("relationship", 3);
weights.put("similarity", 4);
weights.put("affiliation", 5);
weights.put("provision", 6);
weights.put("participation", 7);
weights.put("dedup", 8);
}
public static SortableRelationKey from(final EntityRelEntity e) {
return new SortableRelationKey()
.setSourceId(e.getRelation().getSourceId())
.setTargetId(e.getRelation().getTargetId())
.setRelType(e.getRelation().getRelType())
.setSubRelType(e.getRelation().getSubRelType())
.setRelClass(e.getRelation().getRelClass());
}
public String getSourceId() {
return sourceId;
}
public SortableRelationKey setSourceId(String sourceId) {
this.sourceId = sourceId;
return this;
}
public String getTargetId() {
return targetId;
}
public SortableRelationKey setTargetId(String targetId) {
this.targetId = targetId;
return this;
}
public String getRelType() {
return relType;
}
public SortableRelationKey setRelType(String relType) {
this.relType = relType;
return this;
}
public String getSubRelType() {
return subRelType;
}
public SortableRelationKey setSubRelType(String subRelType) {
this.subRelType = subRelType;
return this;
}
public String getRelClass() {
return relClass;
}
public SortableRelationKey setRelClass(String relClass) {
this.relClass = relClass;
return this;
}
@Override
public int compareTo(SortableRelationKey o) {
return ComparisonChain.start()
.compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
.compare(getSourceId(), o.getSourceId())
.compare(getTargetId(), o.getTargetId())
.result();
}
}
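A small self-contained sketch (not part of the commit) showing the effect of the weight-based ordering; the relType/relClass values are illustrative:
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class SortableRelationKeySketch {
    public static void main(String[] args) {
        // Two keys with the same source id: "outcome" (weight 0) must sort before "dedup" (weight 8)
        SortableRelationKey outcome = new SortableRelationKey()
                .setSourceId("s1").setTargetId("t1")
                .setRelType("resultProject").setSubRelType("outcome").setRelClass("isProducedBy");
        SortableRelationKey dedup = new SortableRelationKey()
                .setSourceId("s1").setTargetId("t2")
                .setRelType("resultResult").setSubRelType("dedup").setRelClass("merges");
        List<SortableRelationKey> keys = new ArrayList<>();
        keys.add(dedup);
        keys.add(outcome);
        Collections.sort(keys);
        System.out.println(keys.get(0).getSubRelType()); // prints: outcome
    }
}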

View File

@ -12,6 +12,10 @@ public class TypedRow implements Serializable {
private String type; private String type;
private String relType;
private String subRelType;
private String relClass;
private String oaf; private String oaf;
public String getSourceId() { public String getSourceId() {
@ -50,6 +54,33 @@ public class TypedRow implements Serializable {
return this; return this;
} }
public String getRelType() {
return relType;
}
public TypedRow setRelType(String relType) {
this.relType = relType;
return this;
}
public String getSubRelType() {
return subRelType;
}
public TypedRow setSubRelType(String subRelType) {
this.subRelType = subRelType;
return this;
}
public String getRelClass() {
return relClass;
}
public TypedRow setRelClass(String relClass) {
this.relClass = relClass;
return this;
}
public String getOaf() { public String getOaf() {
return oaf; return oaf;
} }

View File

@ -26,6 +26,8 @@ import static org.apache.commons.lang3.StringUtils.*;
public class GraphMappingUtils { public class GraphMappingUtils {
public static final String SEPARATOR = "_";
public enum EntityType { public enum EntityType {
publication, dataset, otherresearchproduct, software, datasource, organization, project publication, dataset, otherresearchproduct, software, datasource, organization, project
} }
@ -38,34 +40,6 @@ public class GraphMappingUtils {
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation"); public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
private static BiMap<String, String> relClassMapping = HashBiMap.create();
static {
relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution");
relClassMapping.put("isMergedIn", "merges");
relClassMapping.put("isProducedBy", "produces");
relClassMapping.put("hasParticipant", "isParticipant");
relClassMapping.put("isProvidedBy", "provides");
relClassMapping.put("isRelatedTo", "isRelatedTo");
relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
relClassMapping.put("isRelatedTo", "isRelatedTo");
relClassMapping.put("isSupplementTo", "isSupplementedBy");
}
public static String getInverseRelClass(final String relClass) {
String res = relClassMapping.get(relClass);
if (isNotBlank(res)) {
return res;
}
res = relClassMapping.inverse().get(relClass);
if (isNotBlank(res)) {
return res;
}
throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass);
}
private static final String schemeTemplate = "dnet:%s_%s_relations"; private static final String schemeTemplate = "dnet:%s_%s_relations";
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap(); private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
@ -158,7 +132,7 @@ public class GraphMappingUtils {
re.setLegalname(j.read("$.legalname.value")); re.setLegalname(j.read("$.legalname.value"));
re.setLegalshortname(j.read("$.legalshortname.value")); re.setLegalshortname(j.read("$.legalshortname.value"));
re.setCountry(asQualifier(j.read("$.country"))); re.setCountry(asQualifier(j.read("$.country")));
re.setWebsiteurl(j.read("$.websiteurl.value"));
break; break;
case project: case project:
re.setProjectTitle(j.read("$.title.value")); re.setProjectTitle(j.read("$.title.value"));
@ -250,5 +224,8 @@ public class GraphMappingUtils {
return s; return s;
} }
public static String getRelDescriptor(String relType, String subRelType, String relClass) {
return relType + SEPARATOR + subRelType + SEPARATOR + relClass;
}
} }
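A quick sketch illustrating that getRelDescriptor produces the same relType_subRelType_relClass keys registered as accumulator names in GraphJoiner.prepareAccumulators; the import assumes this module's package layout.
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; // package assumed from the surrounding module
public class RelDescriptorSketch {
    public static void main(String[] args) {
        String key = GraphMappingUtils.getRelDescriptor("resultProject", "outcome", "isProducedBy");
        System.out.println(key); // prints: resultProject_outcome_isProducedBy
    }
}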

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.graph.utils;
import eu.dnetlib.dhp.graph.model.SortableRelationKey;
import org.apache.spark.Partitioner;
import org.apache.spark.util.Utils;
/**
* Used in combination with SortableRelationKey, partitions records by source id so that relations
* sharing the same source id can be sorted according to the ordering defined in SortableRelationKey.
*/
public class RelationPartitioner extends Partitioner {
private int numPartitions;
public RelationPartitioner(int numPartitions) {
this.numPartitions = numPartitions;
}
@Override
public int numPartitions() {
return numPartitions;
}
@Override
public int getPartition(Object key) {
return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions());
}
}
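A hedged sketch of how the partitioner and SortableRelationKey are meant to be combined; the exact call site is not part of this diff, and the input RDD is assumed to be the relation EntityRelEntity records built in GraphJoiner.
import eu.dnetlib.dhp.graph.model.EntityRelEntity;
import eu.dnetlib.dhp.graph.model.SortableRelationKey;
import eu.dnetlib.dhp.graph.utils.RelationPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class RelationPartitionerSketch {
    // Groups relations by source id and sorts them within each partition by the weights defined above
    public static JavaPairRDD<SortableRelationKey, EntityRelEntity> sortBySourceId(
            JavaRDD<EntityRelEntity> rels, int numPartitions) {
        return rels
                .mapToPair((PairFunction<EntityRelEntity, SortableRelationKey, EntityRelEntity>)
                        e -> new Tuple2<>(SortableRelationKey.from(e), e))
                .repartitionAndSortWithinPartitions(new RelationPartitioner(numPartitions));
    }
}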

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.graph.utils;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLDoc;
import com.mycila.xmltool.XMLTag; import com.mycila.xmltool.XMLTag;
@ -11,6 +12,8 @@ import eu.dnetlib.dhp.graph.model.RelatedEntity;
import eu.dnetlib.dhp.graph.model.Tuple2; import eu.dnetlib.dhp.graph.model.Tuple2;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.dom4j.Element; import org.dom4j.Element;
@ -27,6 +30,7 @@ import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.io.StringWriter; import java.io.StringWriter;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -37,6 +41,8 @@ import static org.apache.commons.lang3.StringUtils.substringBefore;
public class XmlRecordFactory implements Serializable { public class XmlRecordFactory implements Serializable {
private Map<String, LongAccumulator> accumulators;
private Set<String> specialDatasourceTypes; private Set<String> specialDatasourceTypes;
private ContextMapper contextMapper; private ContextMapper contextMapper;
@ -47,11 +53,20 @@ public class XmlRecordFactory implements Serializable {
public XmlRecordFactory( public XmlRecordFactory(
final ContextMapper contextMapper, final boolean indent, final ContextMapper contextMapper, final boolean indent,
final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) { final String schemaLocation, final String otherDatasourceTypesUForUI) {
this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI);
}
public XmlRecordFactory(
final Map<String, LongAccumulator> accumulators,
final ContextMapper contextMapper, final boolean indent,
final String schemaLocation, final String otherDatasourceTypesUForUI) {
this.accumulators = accumulators;
this.contextMapper = contextMapper; this.contextMapper = contextMapper;
this.schemaLocation = schemaLocation; this.schemaLocation = schemaLocation;
this.specialDatasourceTypes = otherDatasourceTypesUForUI; this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI));
this.indent = indent; this.indent = indent;
} }
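A minimal instantiation sketch for the new constructor; contextMapper and schemaLocation are assumed to be in scope (see GraphJoiner above) and the comma-separated datasource types are illustrative only.
// The comma-separated value is split into the specialDatasourceTypes set used for datasourcetypeui
XmlRecordFactory recordFactory = new XmlRecordFactory(
        contextMapper, false, schemaLocation,
        "scholarcomminfra,infospace,pubsrepository::mock"); // illustrative type ids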
@ -448,7 +463,7 @@ public class XmlRecordFactory implements Serializable {
if (ds.getSubjects() != null) { if (ds.getSubjects() != null) {
metadata.addAll(ds.getSubjects() metadata.addAll(ds.getSubjects()
.stream() .stream()
.map(sp -> mapStructuredProperty("subject", sp)) .map(sp -> mapStructuredProperty("subjects", sp))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -583,7 +598,7 @@ public class XmlRecordFactory implements Serializable {
if (p.getFundingtree() != null) { if (p.getFundingtree() != null) {
metadata.addAll(p.getFundingtree() metadata.addAll(p.getFundingtree()
.stream() .stream()
.map(ft -> asXmlElement("fundingtree", ft.getValue())) .map(ft -> ft.getValue())
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -712,13 +727,23 @@ public class XmlRecordFactory implements Serializable {
} }
final DataInfo info = rel.getDataInfo(); final DataInfo info = rel.getDataInfo();
final String scheme = getScheme(re.getType(), targetType);
if (StringUtils.isBlank(scheme)) {
throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType));
}
final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass());
if (accumulators.containsKey(accumulatorName)) {
accumulators.get(accumulatorName).add(1);
}
rels.add(templateFactory.getRel( rels.add(templateFactory.getRel(
targetType, targetType,
rel.getTarget(), rel.getTarget(),
Sets.newHashSet(metadata), Sets.newHashSet(metadata),
getInverseRelClass(rel.getRelClass()), rel.getRelClass(),
getScheme(targetType, re.getType()), scheme,
info)); info));
} }
return rels; return rels;
@ -931,7 +956,7 @@ public class XmlRecordFactory implements Serializable {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private String getRelFundingTree(final String xmlTree) { protected static String getRelFundingTree(final String xmlTree) {
String funding = "<funding>"; String funding = "<funding>";
try { try {
final Document ftree = new SAXReader().read(new StringReader(xmlTree)); final Document ftree = new SAXReader().read(new StringReader(xmlTree));
@ -952,11 +977,11 @@ public class XmlRecordFactory implements Serializable {
return funding; return funding;
} }
private String getFunderElement(final Document ftree) { private static String getFunderElement(final Document ftree) {
final String funderId = ftree.valueOf("//fundingtree/funder/id/text()"); final String funderId = ftree.valueOf("//fundingtree/funder/id");
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()"); final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname");
final String funderName = ftree.valueOf("//fundingtree/funder/name/text()"); final String funderName = ftree.valueOf("//fundingtree/funder/name");
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()"); final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction");
return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName) return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName)
+ "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />"; + "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";

View File

@ -1,6 +1,7 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true} {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
] ]

View File

@ -1,4 +1,5 @@
<workflow-app name="index_infospace_graph" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="index_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>hive_db_name</name> <name>hive_db_name</name>
@ -26,6 +27,21 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="reuse_records"/> <start to="reuse_records"/>
<decision name="reuse_records"> <decision name="reuse_records">
@ -42,17 +58,16 @@
<action name="adjancency_lists"> <action name="adjancency_lists">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>build_adjacency_lists</name> <name>build_adjacency_lists</name>
<class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class> <class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar> <jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCoresForJoining}
--executor-cores ${sparkExecutorCores} --executor-memory ${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemoryForJoining}
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -60,6 +75,7 @@
</spark-opts> </spark-opts>
<arg>-mt</arg> <arg>yarn</arg> <arg>-mt</arg> <arg>yarn</arg>
<arg>-is</arg> <arg>${isLookupUrl}</arg> <arg>-is</arg> <arg>${isLookupUrl}</arg>
<arg>-t</arg> <arg>${otherDsTypeId}</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg> <arg>--outputPath</arg><arg>${outputPath}</arg>
</spark> </spark>
@ -69,16 +85,15 @@
<action name="to_solr_index"> <action name="to_solr_index">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>to_solr_index</name> <name>to_solr_index</name>
<class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class> <class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar> <jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCoresForIndexing}
--driver-memory=${sparkDriverMemory} --executor-memory ${sparkExecutorMemoryForIndexing}
--driver-memory=${sparkDriverMemoryForIndexing}
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.graph;
import org.junit.Before;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class GraphJoinerTest {
private ClassLoader cl = getClass().getClassLoader();
private Path workingDir;
private Path inputDir;
private Path outputDir;
@Before
public void before() throws IOException {
workingDir = Files.createTempDirectory("promote_action_set");
inputDir = workingDir.resolve("input");
outputDir = workingDir.resolve("output");
}
private static void copyFiles(Path source, Path target) throws IOException {
Files.list(source).forEach(f -> {
try {
if (Files.isDirectory(f)) {
Path subTarget = Files.createDirectories(target.resolve(f.getFileName()));
copyFiles(f, subTarget);
} else {
Files.copy(f, target.resolve(f.getFileName()));
}
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
});
}
}
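A hedged sketch of how the currently unused cl and copyFiles members could be wired up to stage test input; the resource folder name is hypothetical, and java.nio.file.Paths plus java.net.URISyntaxException would need to be imported.
private void stageInput() throws IOException, URISyntaxException {
    Files.createDirectories(inputDir);
    // "eu/dnetlib/dhp/graph/sample" is a hypothetical test-resource folder, not part of this commit
    Path sample = Paths.get(cl.getResource("eu/dnetlib/dhp/graph/sample").toURI());
    copyFiles(sample, inputDir);
}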

pom.xml
View File

@ -110,6 +110,12 @@
<version>${dhp.hadoop.version}</version> <version>${dhp.hadoop.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>
@ -142,6 +148,13 @@
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${dhp.guava.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-codec</groupId> <groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId> <artifactId>commons-codec</artifactId>
@ -262,6 +275,16 @@
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<version>6.0.5</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
<version>3.9.8-proto250</version>
</dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId> <artifactId>dnet-pace-core</artifactId>
@ -480,6 +503,7 @@
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version> <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version> <dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version> <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>
<scala.version>2.11.12</scala.version> <scala.version>2.11.12</scala.version>
<junit.version>4.12</junit.version> <junit.version>4.12</junit.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>