forked from antonis.lempesis/dnet-hadoop
merged
This commit is contained in:
commit
ca722d4d18
|
@ -13,6 +13,7 @@ public class ModelConstants {
|
||||||
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date";
|
public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date";
|
||||||
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
||||||
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
||||||
|
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
|
||||||
|
|
||||||
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
||||||
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
||||||
|
@ -49,6 +50,13 @@ public class ModelConstants {
|
||||||
public static final String HAS_PARTICIPANT = "hasParticipant";
|
public static final String HAS_PARTICIPANT = "hasParticipant";
|
||||||
public static final String IS_PARTICIPANT = "isParticipant";
|
public static final String IS_PARTICIPANT = "isParticipant";
|
||||||
|
|
||||||
|
public static final String RESULT_ORGANIZATION = "resultOrganization";
|
||||||
|
public static final String AFFILIATION = "affiliation";
|
||||||
|
public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf";
|
||||||
|
public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution";
|
||||||
|
|
||||||
|
public static final String MERGES = "merges";
|
||||||
|
|
||||||
public static final String UNKNOWN = "UNKNOWN";
|
public static final String UNKNOWN = "UNKNOWN";
|
||||||
public static final String NOT_AVAILABLE = "not available";
|
public static final String NOT_AVAILABLE = "not available";
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,15 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.common;
|
package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
@ -379,6 +384,21 @@ public class ModelSupport {
|
||||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> String tableIdentifier(String dbName, String tableName) {
|
||||||
|
|
||||||
|
checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty");
|
||||||
|
checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty");
|
||||||
|
|
||||||
|
return String.format("%s.%s", dbName, tableName);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> String tableIdentifier(String dbName, Class<T> clazz) {
|
||||||
|
|
||||||
|
checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null");
|
||||||
|
|
||||||
|
return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> Function<T, String> idFn() {
|
public static <T extends Oaf> Function<T, String> idFn() {
|
||||||
return x -> {
|
return x -> {
|
||||||
if (isSubClass(x, Relation.class)) {
|
if (isSubClass(x, Relation.class)) {
|
||||||
|
|
|
@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Context mapContext(ResultProtos.Result.Context context) {
|
private static Context mapContext(ResultProtos.Result.Context context) {
|
||||||
|
if (context == null || StringUtils.isBlank(context.getId())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
final Context entity = new Context();
|
final Context entity = new Context();
|
||||||
entity.setId(context.getId());
|
entity.setId(context.getId());
|
||||||
entity
|
entity
|
||||||
|
@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
||||||
|
if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final KeyValue keyValue = new KeyValue();
|
final KeyValue keyValue = new KeyValue();
|
||||||
keyValue.setKey(kv.getKey());
|
keyValue.setKey(kv.getKey());
|
||||||
keyValue.setValue(kv.getValue());
|
keyValue.setValue(kv.getValue());
|
||||||
|
@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
||||||
|
if (sp == null | StringUtils.isBlank(sp.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final StructuredProperty structuredProperty = new StructuredProperty();
|
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||||
structuredProperty.setValue(sp.getValue());
|
structuredProperty.setValue(sp.getValue());
|
||||||
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
||||||
|
@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
||||||
|
if (s == null || StringUtils.isBlank(s.getValue())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final Field<String> stringField = new Field<>();
|
final Field<String> stringField = new Field<>();
|
||||||
stringField.setValue(s.getValue());
|
stringField.setValue(s.getValue());
|
||||||
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
||||||
|
@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
||||||
|
if (b == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final Field<Boolean> booleanField = new Field<>();
|
final Field<Boolean> booleanField = new Field<>();
|
||||||
booleanField.setValue(b.getValue());
|
booleanField.setValue(b.getValue());
|
||||||
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||||
return booleanField;
|
return booleanField;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
|
|
||||||
final Field<Integer> entity = new Field<>();
|
|
||||||
entity.setValue(b.getValue());
|
|
||||||
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
|
|
||||||
return entity;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
||||||
final Journal journal = new Journal();
|
final Journal journal = new Journal();
|
||||||
journal.setConferencedate(j.getConferencedate());
|
journal.setConferencedate(j.getConferencedate());
|
||||||
|
|
|
@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareMergedRelationJob {
|
public class PrepareMergedRelationJob {
|
||||||
|
@ -56,6 +57,7 @@ public class PrepareMergedRelationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
selectMergesRelations(
|
selectMergesRelations(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -84,4 +86,9 @@ public class PrepareMergedRelationJob {
|
||||||
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
|
(MapFunction<String, Relation>) value -> OBJECT_MAPPER.readValue(value, Relation.class),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
removeBlacklistedRelations(
|
removeBlacklistedRelations(
|
||||||
spark,
|
spark,
|
||||||
blacklistPath,
|
blacklistPath,
|
||||||
|
@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
outputPath,
|
outputPath,
|
||||||
mergesPath);
|
mergesPath);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
||||||
|
@ -78,8 +79,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||||
|
|
||||||
log.info("InputRelationCount: {}", inputRelation.count());
|
|
||||||
|
|
||||||
Dataset<Relation> dedupSource = blackListed
|
Dataset<Relation> dedupSource = blackListed
|
||||||
.joinWith(
|
.joinWith(
|
||||||
mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
|
mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
|
||||||
|
@ -102,11 +101,6 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
return c._1();
|
return c._1();
|
||||||
}, Encoders.bean(Relation.class));
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
dedupBL
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(blacklistPath + "/deduped");
|
|
||||||
|
|
||||||
inputRelation
|
inputRelation
|
||||||
.joinWith(
|
.joinWith(
|
||||||
dedupBL, (inputRelation
|
dedupBL, (inputRelation
|
||||||
|
@ -144,4 +138,8 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,25 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -49,8 +68,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +77,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +86,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -82,8 +95,6 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -93,8 +104,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -104,8 +113,6 @@
|
||||||
|
|
||||||
<action name="copy_project">
|
<action name="copy_project">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -115,8 +122,6 @@
|
||||||
|
|
||||||
<action name="copy_datasource">
|
<action name="copy_datasource">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -128,8 +133,6 @@
|
||||||
|
|
||||||
<action name="read_blacklist">
|
<action name="read_blacklist">
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
|
<main-class>eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB</main-class>
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
|
@ -156,6 +159,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
||||||
|
@ -180,6 +184,7 @@
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
|
|
|
@ -29,31 +29,32 @@ public class EventFactory {
|
||||||
"yyyy-MM-dd"
|
"yyyy-MM-dd"
|
||||||
};
|
};
|
||||||
|
|
||||||
public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo<?> updateInfo) {
|
public static Event newBrokerEvent(final UpdateInfo<?> updateInfo) {
|
||||||
|
|
||||||
final long now = new Date().getTime();
|
final long now = new Date().getTime();
|
||||||
|
|
||||||
final Event res = new Event();
|
final Event res = new Event();
|
||||||
|
|
||||||
final Map<String, Object> map = createMapFromResult(target, source, updateInfo);
|
final Map<String, Object> map = createMapFromResult(updateInfo);
|
||||||
|
|
||||||
final String payload = createPayload(target, updateInfo);
|
final String payload = createPayload(updateInfo);
|
||||||
|
|
||||||
final String eventId = calculateEventId(
|
final String eventId = calculateEventId(
|
||||||
updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString());
|
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
|
||||||
|
updateInfo.getHighlightValueAsString());
|
||||||
|
|
||||||
res.setEventId(eventId);
|
res.setEventId(eventId);
|
||||||
res.setProducerId(PRODUCER_ID);
|
res.setProducerId(PRODUCER_ID);
|
||||||
res.setPayload(payload);
|
res.setPayload(payload);
|
||||||
res.setMap(map);
|
res.setMap(map);
|
||||||
res.setTopic(updateInfo.getTopic());
|
res.setTopic(updateInfo.getTopicPath());
|
||||||
res.setCreationDate(now);
|
res.setCreationDate(now);
|
||||||
res.setExpiryDate(calculateExpiryDate(now));
|
res.setExpiryDate(calculateExpiryDate(now));
|
||||||
res.setInstantMessage(false);
|
res.setInstantMessage(false);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String createPayload(final Result result, final UpdateInfo<?> updateInfo) {
|
private static String createPayload(final UpdateInfo<?> updateInfo) {
|
||||||
final OpenAireEventPayload payload = new OpenAireEventPayload();
|
final OpenAireEventPayload payload = new OpenAireEventPayload();
|
||||||
// TODO
|
// TODO
|
||||||
|
|
||||||
|
@ -62,32 +63,34 @@ public class EventFactory {
|
||||||
return payload.toJSON();
|
return payload.toJSON();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, Object> createMapFromResult(final Result oaf, final Result source,
|
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
|
||||||
final UpdateInfo<?> updateInfo) {
|
|
||||||
final Map<String, Object> map = new HashMap<>();
|
final Map<String, Object> map = new HashMap<>();
|
||||||
|
|
||||||
final List<KeyValue> collectedFrom = oaf.getCollectedfrom();
|
final Result source = updateInfo.getSource();
|
||||||
|
final Result target = updateInfo.getTarget();
|
||||||
|
|
||||||
|
final List<KeyValue> collectedFrom = target.getCollectedfrom();
|
||||||
if (collectedFrom.size() == 1) {
|
if (collectedFrom.size() == 1) {
|
||||||
map.put("target_datasource_id", collectedFrom.get(0).getKey());
|
map.put("target_datasource_id", collectedFrom.get(0).getKey());
|
||||||
map.put("target_datasource_name", collectedFrom.get(0).getValue());
|
map.put("target_datasource_name", collectedFrom.get(0).getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> ids = oaf.getOriginalId();
|
final List<String> ids = target.getOriginalId();
|
||||||
if (ids.size() > 0) {
|
if (ids.size() > 0) {
|
||||||
map.put("target_publication_id", ids.get(0));
|
map.put("target_publication_id", ids.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<StructuredProperty> titles = oaf.getTitle();
|
final List<StructuredProperty> titles = target.getTitle();
|
||||||
if (titles.size() > 0) {
|
if (titles.size() > 0) {
|
||||||
map.put("target_publication_title", titles.get(0));
|
map.put("target_publication_title", titles.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
final long date = parseDateTolong(oaf.getDateofacceptance().getValue());
|
final long date = parseDateTolong(target.getDateofacceptance().getValue());
|
||||||
if (date > 0) {
|
if (date > 0) {
|
||||||
map.put("target_dateofacceptance", date);
|
map.put("target_dateofacceptance", date);
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<StructuredProperty> subjects = oaf.getSubject();
|
final List<StructuredProperty> subjects = target.getSubject();
|
||||||
if (subjects.size() > 0) {
|
if (subjects.size() > 0) {
|
||||||
map
|
map
|
||||||
.put(
|
.put(
|
||||||
|
@ -95,7 +98,7 @@ public class EventFactory {
|
||||||
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<Author> authors = oaf.getAuthor();
|
final List<Author> authors = target.getAuthor();
|
||||||
if (authors.size() > 0) {
|
if (authors.size() > 0) {
|
||||||
map
|
map
|
||||||
.put(
|
.put(
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.model;
|
||||||
|
|
||||||
|
public enum Topic {
|
||||||
|
|
||||||
|
// ENRICHMENT MISSING
|
||||||
|
ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT(
|
||||||
|
"ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE(
|
||||||
|
"ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID(
|
||||||
|
"ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE(
|
||||||
|
"ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC(
|
||||||
|
"ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV(
|
||||||
|
"ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL(
|
||||||
|
"ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC(
|
||||||
|
"ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM(
|
||||||
|
"ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK(
|
||||||
|
"ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID(
|
||||||
|
"ENRICH/MISSING/AUTHOR/ORCID"),
|
||||||
|
|
||||||
|
// ENRICHMENT MORE
|
||||||
|
ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT(
|
||||||
|
"ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT(
|
||||||
|
"ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC(
|
||||||
|
"ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV(
|
||||||
|
"ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL(
|
||||||
|
"ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC(
|
||||||
|
"ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM(
|
||||||
|
"ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"),
|
||||||
|
|
||||||
|
// ADDITION
|
||||||
|
ADD_BY_PROJECT("ADD/BY_PROJECT");
|
||||||
|
|
||||||
|
Topic(final String path) {
|
||||||
|
this.path = path;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String path;
|
||||||
|
|
||||||
|
public String getPath() {
|
||||||
|
return this.path;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Topic fromPath(final String path) {
|
||||||
|
for (final Topic t : Topic.values()) {
|
||||||
|
if (t.getPath().equals(path)) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -14,21 +14,20 @@ import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.broker.model.Event;
|
import eu.dnetlib.dhp.broker.model.Event;
|
||||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject;
|
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
@ -37,7 +36,16 @@ public class GenerateEventsApplication {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final UpdateMatcher<?> enrichMissingAbstract = new EnrichMissingAbstract();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingPid = new EnrichMissingPid();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingProject = new EnrichMissingProject();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
||||||
|
private static final UpdateMatcher<?> enrichMissingSubject = new EnrichMissingSubject();
|
||||||
|
private static final UpdateMatcher<?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
||||||
|
private static final UpdateMatcher<?> enrichMorePid = new EnrichMorePid();
|
||||||
|
private static final UpdateMatcher<?> enrichMoreSubject = new EnrichMoreSubject();
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -76,37 +84,22 @@ public class GenerateEventsApplication {
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Event> generateEvents(final Result... children) {
|
private List<Event> generateEvents(final Result... children) {
|
||||||
final List<Event> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
|
||||||
for (final Result source : children) {
|
for (final Result target : children) {
|
||||||
for (final Result target : children) {
|
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children));
|
||||||
if (source != target) {
|
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
|
||||||
list
|
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
.addAll(
|
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
|
||||||
findUpdates(source, target)
|
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children));
|
||||||
.stream()
|
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
|
||||||
.map(info -> EventFactory.newBrokerEvent(source, target, info))
|
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
|
||||||
.collect(Collectors.toList()));
|
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
|
||||||
}
|
list.addAll(enrichMorePid.searchUpdatesForRecord(target, children));
|
||||||
}
|
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children));
|
||||||
}
|
}
|
||||||
|
|
||||||
return list;
|
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||||
}
|
|
||||||
|
|
||||||
private List<UpdateInfo<?>> findUpdates(final Result source, final Result target) {
|
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
|
||||||
list.addAll(EnrichMissingAbstract.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingOpenAccess.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingPid.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingProject.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingPublicationDate.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMissingSubject.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMoreOpenAccess.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMorePid.findUpdates(source, target));
|
|
||||||
list.addAll(EnrichMoreSubject.findUpdates(source, target));
|
|
||||||
return list;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
||||||
|
|
||||||
|
public EnrichMissingAbstract() {
|
||||||
|
super(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||||
|
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
|
||||||
|
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target));
|
||||||
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<String> generateUpdateInfo(final String highlightValue, final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_ABSTRACT,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, s) -> p.getAbstracts().add(s),
|
||||||
|
s -> s);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMissingAuthorOrcid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source, final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
|
||||||
|
|
||||||
|
public EnrichMissingOpenAccess() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||||
|
final long count = target
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.map(i -> i.getAccessright().getClassid())
|
||||||
|
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.count();
|
||||||
|
|
||||||
|
if (count > 0) {
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||||
|
.flatMap(s -> s)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_OA_VERSION,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, i) -> p.getInstances().add(i),
|
||||||
|
Instance::getUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingPid extends UpdateMatcher<Pid> {
|
||||||
|
|
||||||
|
public EnrichMissingPid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||||
|
final long count = target.getPid().size();
|
||||||
|
|
||||||
|
if (count > 0) {
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(ConversionUtils::oafPidToBrokerPid)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pid) -> p.getPids().add(pid),
|
||||||
|
pid -> pid.getType() + "::" + pid.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Project;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingProject extends UpdateMatcher<Project> {
|
||||||
|
|
||||||
|
public EnrichMissingProject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Project>> findUpdates(final Result source, final Result target) {
|
||||||
|
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Project> generateUpdateInfo(final Project highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PROJECT,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, prj) -> p.getProjects().add(prj),
|
||||||
|
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
||||||
|
|
||||||
|
public EnrichMissingPublicationDate() {
|
||||||
|
super(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||||
|
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||||
|
return Arrays.asList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<String> generateUpdateInfo(final String highlightValue, final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, date) -> p.setPublicationdate(date),
|
||||||
|
s -> s);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMissingSubject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingTypes = target
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
|
||||||
|
.map(ConversionUtils::oafSubjectToPair)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
|
||||||
|
|
||||||
|
public EnrichMoreOpenAccess() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> urls = target
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(i -> i.getUrl())
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||||
|
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||||
|
.flatMap(s -> s)
|
||||||
|
.filter(i -> !urls.contains(i.getUrl()))
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_OA_VERSION,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, i) -> p.getInstances().add(i),
|
||||||
|
Instance::getUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMorePid extends UpdateMatcher<Pid> {
|
||||||
|
|
||||||
|
public EnrichMorePid() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingPids = target
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||||
|
.map(ConversionUtils::oafPidToBrokerPid)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.ENRICH_MORE_PID,
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pid) -> p.getPids().add(pid),
|
||||||
|
pid -> pid.getType() + "::" + pid.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
|
||||||
|
|
||||||
|
public EnrichMoreSubject() {
|
||||||
|
super(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||||
|
final Set<String> existingSubjects = target
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
return source
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||||
|
.map(ConversionUtils::oafSubjectToPair)
|
||||||
|
.map(i -> generateUpdateInfo(i, source, target))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||||
|
final Result source,
|
||||||
|
final Result target) {
|
||||||
|
|
||||||
|
return new UpdateInfo<>(
|
||||||
|
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
|
||||||
|
highlightValue, source, target,
|
||||||
|
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||||
|
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
|
public abstract class UpdateMatcher<T> {
|
||||||
|
|
||||||
|
private final boolean multipleUpdate;
|
||||||
|
|
||||||
|
public UpdateMatcher(final boolean multipleUpdate) {
|
||||||
|
this.multipleUpdate = multipleUpdate;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final Result res, final Result... others) {
|
||||||
|
|
||||||
|
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||||
|
|
||||||
|
for (final Result source : others) {
|
||||||
|
if (source != res) {
|
||||||
|
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
||||||
|
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||||
|
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||||
|
} else {
|
||||||
|
infoMap.put(s, info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final Collection<UpdateInfo<T>> values = infoMap.values();
|
||||||
|
|
||||||
|
if (values.isEmpty() || multipleUpdate) {
|
||||||
|
return values;
|
||||||
|
} else {
|
||||||
|
final UpdateInfo<T> v = values
|
||||||
|
.stream()
|
||||||
|
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
|
||||||
|
.findFirst()
|
||||||
|
.get();
|
||||||
|
return Arrays.asList(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract List<UpdateInfo<T>> findUpdates(Result source, Result target);
|
||||||
|
|
||||||
|
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue, final Result source,
|
||||||
|
final Result target);
|
||||||
|
|
||||||
|
protected static boolean isMissing(final List<Field<String>> list) {
|
||||||
|
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
public class BrokerConstants {
|
||||||
|
|
||||||
|
public final static String OPEN_ACCESS = "OPEN";
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.Instance;
|
||||||
|
import eu.dnetlib.broker.objects.Pid;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class ConversionUtils {
|
||||||
|
|
||||||
|
public static Stream<Instance> oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||||
|
return i.getUrl().stream().map(url -> {
|
||||||
|
final Instance r = new Instance();
|
||||||
|
r.setUrl(url);
|
||||||
|
r.setInstancetype(i.getInstancetype().getClassid());
|
||||||
|
r.setLicense(BrokerConstants.OPEN_ACCESS);
|
||||||
|
r.setHostedby(i.getHostedby().getValue());
|
||||||
|
return r;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Pid oafPidToBrokerPid(final StructuredProperty sp) {
|
||||||
|
final Pid pid = new Pid();
|
||||||
|
pid.setValue(sp.getValue());
|
||||||
|
pid.setType(sp.getQualifier().getClassid());
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
|
||||||
|
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingAbstract extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingAbstract> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingAbstract(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/ABSTRACT", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getAbstracts().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingAuthorOrcid extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingAuthorOrcid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
// TODO
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingOpenAccess extends UpdateInfo<Instance> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingOpenAccess> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getInstances().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getUrl();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingPid extends UpdateInfo<Pid> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingPid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingPid(final Pid highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getPids().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Project;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingProject extends UpdateInfo<Project> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingProject> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingProject(final Project highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PROJECT", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getProjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram()
|
|
||||||
+ getHighlightValue().getCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,31 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingPublicationDate extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingPublicationDate> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingPublicationDate(final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().setPublicationdate(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMissingSubject extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMissingSubject> findUpdates(final Result source, final Result target) {
|
|
||||||
// MESHEUROPMC
|
|
||||||
// ARXIV
|
|
||||||
// JEL
|
|
||||||
// DDC
|
|
||||||
// ACM
|
|
||||||
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getSubjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.Instance;
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMoreOpenAccess extends UpdateInfo<Instance> {
|
|
||||||
|
|
||||||
public static List<EnrichMoreOpenAccess> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getInstances().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getUrl();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.broker.objects.Pid;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMorePid extends UpdateInfo<Pid> {
|
|
||||||
|
|
||||||
public static List<EnrichMorePid> findUpdates(final Result source, final Result target) {
|
|
||||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMorePid(final Pid highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/PID", highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getPids().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue().getType() + "::" + getHighlightValue().getValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class EnrichMoreSubject extends UpdateInfo<String> {
|
|
||||||
|
|
||||||
public static List<EnrichMoreSubject> findUpdates(final Result source, final Result target) {
|
|
||||||
// MESHEUROPMC
|
|
||||||
// ARXIV
|
|
||||||
// JEL
|
|
||||||
// DDC
|
|
||||||
// ACM
|
|
||||||
|
|
||||||
return Arrays.asList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) {
|
|
||||||
super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
|
||||||
payload.getHighlight().getSubjects().add(getHighlightValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getHighlightValueAsString() {
|
|
||||||
return getHighlightValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,36 +1,77 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||||
|
import eu.dnetlib.broker.objects.Publication;
|
||||||
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public abstract class UpdateInfo<T> {
|
public final class UpdateInfo<T> {
|
||||||
|
|
||||||
private final String topic;
|
private final Topic topic;
|
||||||
|
|
||||||
private final T highlightValue;
|
private final T highlightValue;
|
||||||
|
|
||||||
|
private final Result source;
|
||||||
|
|
||||||
|
private final Result target;
|
||||||
|
|
||||||
|
private final BiConsumer<Publication, T> compileHighlight;
|
||||||
|
|
||||||
|
private final Function<T, String> highlightToString;
|
||||||
|
|
||||||
private final float trust;
|
private final float trust;
|
||||||
|
|
||||||
protected UpdateInfo(final String topic, final T highlightValue, final float trust) {
|
public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
|
||||||
|
final BiConsumer<Publication, T> compileHighlight,
|
||||||
|
final Function<T, String> highlightToString) {
|
||||||
this.topic = topic;
|
this.topic = topic;
|
||||||
this.highlightValue = highlightValue;
|
this.highlightValue = highlightValue;
|
||||||
this.trust = trust;
|
this.source = source;
|
||||||
|
this.target = target;
|
||||||
|
this.compileHighlight = compileHighlight;
|
||||||
|
this.highlightToString = highlightToString;
|
||||||
|
this.trust = calculateTrust(source, target);
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getHighlightValue() {
|
public T getHighlightValue() {
|
||||||
return highlightValue;
|
return highlightValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Result getSource() {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Result getTarget() {
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
|
||||||
|
private float calculateTrust(final Result source, final Result target) {
|
||||||
|
// TODO
|
||||||
|
return 0.9f;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Topic getTopic() {
|
||||||
|
return topic;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTopicPath() {
|
||||||
|
return topic.getPath();
|
||||||
|
}
|
||||||
|
|
||||||
public float getTrust() {
|
public float getTrust() {
|
||||||
return trust;
|
return trust;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTopic() {
|
public void compileHighlight(final OpenAireEventPayload payload) {
|
||||||
return topic;
|
compileHighlight.accept(payload.getHighlight(), getHighlightValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
abstract public void compileHighlight(OpenAireEventPayload payload);
|
public String getHighlightValueAsString() {
|
||||||
|
return highlightToString.apply(getHighlightValue());
|
||||||
abstract public String getHighlightValueAsString();
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -129,6 +129,9 @@ public class DedupUtility {
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
|
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
|
||||||
Author r = simAuhtor.get()._2();
|
Author r = simAuhtor.get()._2();
|
||||||
|
if (r.getPid() == null) {
|
||||||
|
r.setPid(new ArrayList<>());
|
||||||
|
}
|
||||||
r.getPid().add(a._1());
|
r.getPid().add(a._1());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, KeyValue, Qualifier, Result, StructuredProperty}
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
|
|
||||||
|
object DoiBoostMappingUtil {
|
||||||
|
|
||||||
|
//STATIC STRING
|
||||||
|
val MAG = "microsoft"
|
||||||
|
val ORCID = "ORCID"
|
||||||
|
val CROSSREF = "Crossref"
|
||||||
|
val UNPAYWALL = "UnpayWall"
|
||||||
|
val GRID_AC = "grid.ac"
|
||||||
|
val WIKPEDIA = "wikpedia"
|
||||||
|
val doiBoostNSPREFIX = "doiboost____"
|
||||||
|
val OPENAIRE_PREFIX = "openaire____"
|
||||||
|
val SEPARATOR = "::"
|
||||||
|
val DNET_LANGUAGES = "dnet:languages"
|
||||||
|
val PID_TYPES = "dnet:pid_types"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def generateDataInfo(): DataInfo = {
|
||||||
|
val di = new DataInfo
|
||||||
|
di.setDeletedbyinference(false)
|
||||||
|
di.setInferred(false)
|
||||||
|
di.setInvisible(false)
|
||||||
|
di.setTrust("0.9")
|
||||||
|
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
||||||
|
di
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||||
|
val sp = new StructuredProperty
|
||||||
|
sp.setQualifier(createQualifier(classId, schemeId))
|
||||||
|
sp.setValue(value)
|
||||||
|
sp
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||||
|
val sp = new StructuredProperty
|
||||||
|
sp.setQualifier(createQualifier(classId, schemeId))
|
||||||
|
sp.setValue(value)
|
||||||
|
sp.setDataInfo(dataInfo)
|
||||||
|
sp
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def createCrossrefCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
|
val cf = new KeyValue
|
||||||
|
cf.setValue(CROSSREF)
|
||||||
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5("crossref"))
|
||||||
|
cf
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateIdentifier(oaf: Result, doi: String): String = {
|
||||||
|
val id = DHPUtils.md5(doi.toLowerCase)
|
||||||
|
if (oaf.isInstanceOf[Dataset])
|
||||||
|
return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||||
|
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def createMAGCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
|
val cf = new KeyValue
|
||||||
|
cf.setValue(MAG)
|
||||||
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
|
||||||
|
cf
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
||||||
|
val q = new Qualifier
|
||||||
|
q.setClassid(clsName)
|
||||||
|
q.setClassname(clsValue)
|
||||||
|
q.setSchemeid(schName)
|
||||||
|
q.setSchemename(schValue)
|
||||||
|
q
|
||||||
|
}
|
||||||
|
|
||||||
|
def createQualifier(cls: String, sch: String): Qualifier = {
|
||||||
|
createQualifier(cls, cls, sch, sch)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def asField[T](value: T): Field[T] = {
|
||||||
|
val tmp = new Field[T]
|
||||||
|
tmp.setValue(value)
|
||||||
|
tmp
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -14,6 +14,7 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
||||||
import scala.util.matching.Regex
|
import scala.util.matching.Regex
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||||
|
|
||||||
case class mappingAffiliation(name: String) {}
|
case class mappingAffiliation(name: String) {}
|
||||||
|
|
||||||
|
@ -25,18 +26,7 @@ case class mappingFunder(name: String, DOI: Option[String], award: Option[List[S
|
||||||
case object Crossref2Oaf {
|
case object Crossref2Oaf {
|
||||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||||
|
|
||||||
//STATIC STRING
|
|
||||||
val MAG = "MAG"
|
|
||||||
val ORCID = "ORCID"
|
|
||||||
val CROSSREF = "Crossref"
|
|
||||||
val UNPAYWALL = "UnpayWall"
|
|
||||||
val GRID_AC = "grid.ac"
|
|
||||||
val WIKPEDIA = "wikpedia"
|
|
||||||
val doiBoostNSPREFIX = "doiboost____"
|
|
||||||
val OPENAIRE_PREFIX = "openaire____"
|
|
||||||
val SEPARATOR = "::"
|
|
||||||
val DNET_LANGUAGES = "dnet:languages"
|
|
||||||
val PID_TYPES = "dnet:pid_types"
|
|
||||||
|
|
||||||
val mappingCrossrefType = Map(
|
val mappingCrossrefType = Map(
|
||||||
"book-section" -> "publication",
|
"book-section" -> "publication",
|
||||||
|
@ -116,7 +106,7 @@ case object Crossref2Oaf {
|
||||||
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
|
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
|
||||||
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
|
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
|
||||||
|
|
||||||
result.setCollectedfrom(List(createCollectedFrom()).asJava)
|
result.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava)
|
||||||
|
|
||||||
// Publisher ( Name of work's publisher mapped into Result/Publisher)
|
// Publisher ( Name of work's publisher mapped into Result/Publisher)
|
||||||
val publisher = (json \ "publisher").extractOrElse[String](null)
|
val publisher = (json \ "publisher").extractOrElse[String](null)
|
||||||
|
@ -168,7 +158,7 @@ case object Crossref2Oaf {
|
||||||
result.setInstance(List(instance).asJava)
|
result.setInstance(List(instance).asJava)
|
||||||
instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
|
instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
|
||||||
|
|
||||||
instance.setCollectedfrom(createCollectedFrom())
|
instance.setCollectedfrom(createCrossrefCollectedFrom())
|
||||||
if (StringUtils.isNotBlank(issuedDate)) {
|
if (StringUtils.isNotBlank(issuedDate)) {
|
||||||
instance.setDateofacceptance(asField(issuedDate))
|
instance.setDateofacceptance(asField(issuedDate))
|
||||||
}
|
}
|
||||||
|
@ -215,7 +205,7 @@ case object Crossref2Oaf {
|
||||||
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
|
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
|
||||||
|
|
||||||
if (funderList.nonEmpty) {
|
if (funderList.nonEmpty) {
|
||||||
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
|
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -416,71 +406,8 @@ case object Crossref2Oaf {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateIdentifier(oaf: Result, doi: String): String = {
|
|
||||||
val id = DHPUtils.md5(doi.toLowerCase)
|
|
||||||
if (oaf.isInstanceOf[Dataset])
|
|
||||||
return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
|
||||||
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
|
||||||
}
|
|
||||||
|
|
||||||
def asField[T](value: T): Field[T] = {
|
|
||||||
val tmp = new Field[T]
|
|
||||||
tmp.setValue(value)
|
|
||||||
tmp
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo(): DataInfo = {
|
|
||||||
val di = new DataInfo
|
|
||||||
di.setDeletedbyinference(false)
|
|
||||||
di.setInferred(false)
|
|
||||||
di.setInvisible(false)
|
|
||||||
di.setTrust("0.9")
|
|
||||||
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
|
||||||
di
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
|
||||||
val sp = new StructuredProperty
|
|
||||||
sp.setQualifier(createQualifier(classId, schemeId))
|
|
||||||
sp.setValue(value)
|
|
||||||
sp
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
|
||||||
val sp = new StructuredProperty
|
|
||||||
sp.setQualifier(createQualifier(classId, schemeId))
|
|
||||||
sp.setValue(value)
|
|
||||||
sp.setDataInfo(dataInfo)
|
|
||||||
sp
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
def createCollectedFrom(): KeyValue = {
|
|
||||||
|
|
||||||
val cf = new KeyValue
|
|
||||||
cf.setValue(CROSSREF)
|
|
||||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5("crossref"))
|
|
||||||
cf
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
|
|
||||||
val q = new Qualifier
|
|
||||||
q.setClassid(clsName)
|
|
||||||
q.setClassname(clsValue)
|
|
||||||
q.setSchemeid(schName)
|
|
||||||
q.setSchemename(schValue)
|
|
||||||
q
|
|
||||||
}
|
|
||||||
|
|
||||||
def createQualifier(cls: String, sch: String): Qualifier = {
|
|
||||||
createQualifier(cls, cls, sch, sch)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def generateItemFromType(objectType: String, objectSubType: String): Result = {
|
def generateItemFromType(objectType: String, objectSubType: String): Result = {
|
||||||
|
|
|
@ -1,52 +1,215 @@
|
||||||
package eu.dnetlib.doiboost.mag
|
package eu.dnetlib.doiboost.mag
|
||||||
|
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.collection.mutable
|
||||||
|
import scala.util.matching.Regex
|
||||||
|
|
||||||
|
|
||||||
case class Papers(PaperId:Long, Rank:Integer, Doi:String,
|
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
|
||||||
DocType:String, PaperTitle:String, OriginalTitle:String,
|
DocType: String, PaperTitle: String, OriginalTitle: String,
|
||||||
BookTitle:String, Year:Option[Integer], Date:Option[java.sql.Timestamp], Publisher:String,
|
BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
|
||||||
JournalId:Option[Long], ConferenceSeriesId:Option[Long], ConferenceInstanceId:Option[Long],
|
JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
|
||||||
Volume:String, Issue:String, FirstPage:String, LastPage:String,
|
Volume: String, Issue: String, FirstPage: String, LastPage: String,
|
||||||
ReferenceCount:Option[Long], CitationCount:Option[Long], EstimatedCitation:Option[Long],
|
ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
|
||||||
OriginalVenue:String, FamilyId:Option[Long], CreatedDate:java.sql.Timestamp) {}
|
OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}
|
||||||
|
|
||||||
|
|
||||||
case class PaperAbstract(PaperId:Long,IndexedAbstract:String) {}
|
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
|
||||||
|
|
||||||
|
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
|
||||||
|
|
||||||
|
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {}
|
||||||
|
|
||||||
|
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {}
|
||||||
|
|
||||||
|
|
||||||
|
case class MagAuthorAffiliation(author: MagAuthor, affiliation:String)
|
||||||
|
|
||||||
|
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
|
||||||
|
|
||||||
|
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String) {}
|
||||||
|
|
||||||
|
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {}
|
||||||
|
|
||||||
|
case class MagUrl(PaperId: Long, instances: List[String])
|
||||||
|
|
||||||
|
|
||||||
|
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
|
||||||
|
|
||||||
|
|
||||||
case object ConversionUtil {
|
case object ConversionUtil {
|
||||||
|
|
||||||
|
def extractMagIdentifier(pids:mutable.Buffer[String]) :String ={
|
||||||
|
val magIDRegex: Regex = "^[0-9]+$".r
|
||||||
|
val s =pids.filter(p=> magIDRegex.findAllIn(p).hasNext)
|
||||||
|
|
||||||
|
if (s.nonEmpty)
|
||||||
def transformPaperAbstract(input:PaperAbstract) : PaperAbstract = {
|
return s.head
|
||||||
PaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def convertInvertedIndexString(json_input:String) :String = {
|
def addInstances(a: (Publication, MagUrl)): Publication = {
|
||||||
|
val pub = a._1
|
||||||
|
val urls = a._2
|
||||||
|
|
||||||
|
|
||||||
|
val i = new Instance
|
||||||
|
|
||||||
|
|
||||||
|
if (urls!= null) {
|
||||||
|
|
||||||
|
val l:List[String] = urls.instances.filter(k=>k.nonEmpty):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
|
||||||
|
|
||||||
|
i.setUrl(l.asJava)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
|
||||||
|
|
||||||
|
i.setCollectedfrom(createMAGCollectedFrom())
|
||||||
|
pub.setInstance(List(i).asJava)
|
||||||
|
pub
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
|
||||||
|
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
|
||||||
|
val paper = inputParams._1._1
|
||||||
|
val journal = inputParams._1._2
|
||||||
|
val authors = inputParams._2
|
||||||
|
|
||||||
|
val pub = new Publication
|
||||||
|
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||||
|
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
|
||||||
|
|
||||||
|
//Set identifier as {50|60} | doiboost____::md5(DOI)
|
||||||
|
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
|
||||||
|
|
||||||
|
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
|
||||||
|
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
|
||||||
|
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||||
|
|
||||||
|
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||||
|
|
||||||
|
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||||
|
|
||||||
|
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||||
|
|
||||||
|
a.setFullname(f.author.DisplayName.get)
|
||||||
|
|
||||||
|
if(f.affiliation!= null)
|
||||||
|
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||||
|
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
|
||||||
|
a
|
||||||
|
}
|
||||||
|
pub.setAuthor(authorsOAF.asJava)
|
||||||
|
|
||||||
|
|
||||||
|
if (paper.Date != null && paper.Date.isDefined) {
|
||||||
|
pub.setDateofacceptance(asField(paper.Date.get.toString))
|
||||||
|
}
|
||||||
|
pub.setPublisher(asField(paper.Publisher))
|
||||||
|
|
||||||
|
|
||||||
|
if (journal != null && journal.DisplayName.isDefined) {
|
||||||
|
val j = new Journal
|
||||||
|
|
||||||
|
j.setName(journal.DisplayName.get)
|
||||||
|
j.setSp(paper.FirstPage)
|
||||||
|
j.setEp(paper.LastPage)
|
||||||
|
if (journal.Publisher.isDefined)
|
||||||
|
j.setEdition(journal.Publisher.get)
|
||||||
|
if (journal.Issn.isDefined)
|
||||||
|
j.setIssnPrinted(journal.Issn.get)
|
||||||
|
pub.setJournal(j)
|
||||||
|
}
|
||||||
|
pub
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
|
||||||
|
|
||||||
|
val paper = inputParams._1._1
|
||||||
|
val authors = inputParams._1._2
|
||||||
|
val description = inputParams._2
|
||||||
|
|
||||||
|
val pub = new Publication
|
||||||
|
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||||
|
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
|
||||||
|
|
||||||
|
//Set identifier as {50|60} | doiboost____::md5(DOI)
|
||||||
|
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
|
||||||
|
|
||||||
|
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
|
||||||
|
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
|
||||||
|
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||||
|
|
||||||
|
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||||
|
|
||||||
|
|
||||||
|
if (description != null) {
|
||||||
|
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||||
|
|
||||||
|
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||||
|
|
||||||
|
a.setFullname(f.author.DisplayName.get)
|
||||||
|
|
||||||
|
if(f.affiliation!= null)
|
||||||
|
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||||
|
|
||||||
|
|
||||||
|
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
|
||||||
|
|
||||||
|
a
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (paper.Date != null) {
|
||||||
|
pub.setDateofacceptance(asField(paper.Date.toString))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub.setAuthor(authorsOAF.asJava)
|
||||||
|
|
||||||
|
|
||||||
|
pub
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def convertInvertedIndexString(json_input: String): String = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(json_input)
|
lazy val json: json4s.JValue = parse(json_input)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val idl = (json \ "IndexLength").extract[Int]
|
val idl = (json \ "IndexLength").extract[Int]
|
||||||
|
|
||||||
if (idl > 0) {
|
if (idl > 0) {
|
||||||
val res = Array.ofDim[String](idl)
|
val res = Array.ofDim[String](idl)
|
||||||
|
|
||||||
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
|
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
|
||||||
|
|
||||||
for {(k:String,v:List[Int]) <- iid}{
|
for {(k: String, v: List[Int]) <- iid} {
|
||||||
v.foreach(item => res(item) = k)
|
v.foreach(item => res(item) = k)
|
||||||
}
|
}
|
||||||
|
(0 until idl).foreach(i => {
|
||||||
|
if (res(i) == null)
|
||||||
|
res(i) = ""
|
||||||
|
})
|
||||||
return res.mkString(" ")
|
return res.mkString(" ")
|
||||||
|
|
||||||
}
|
}
|
||||||
""
|
""
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,17 @@
|
||||||
package eu.dnetlib.doiboost.mag
|
package eu.dnetlib.doiboost.mag
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.asField
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import org.apache.spark.sql.functions._
|
import org.apache.spark.sql.functions._
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkPreProcessMAG {
|
object SparkPreProcessMAG {
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,15 +27,21 @@ object SparkPreProcessMAG {
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
||||||
|
implicit val tupleForJoinEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger.info("Phase 1) make uninque DOI in Papers:")
|
logger.info("Phase 1) make uninque DOI in Papers:")
|
||||||
|
|
||||||
val d: Dataset[Papers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[Papers]
|
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
|
||||||
|
|
||||||
|
|
||||||
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
||||||
val result: RDD[Papers] = d.where(col("Doi").isNotNull).rdd.map { p: Papers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: Papers, p2: Papers) =>
|
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull).rdd.map { p: MagPapers => Tuple2(p.Doi, p) }.reduceByKey { case (p1: MagPapers, p2: MagPapers) =>
|
||||||
var r = if (p1 == null) p2 else p1
|
var r = if (p1 == null) p2 else p1
|
||||||
if (p1 != null && p2 != null) {
|
if (p1 != null && p2 != null) {
|
||||||
if (p1.CreatedDate != null && p2.CreatedDate != null) {
|
if (p1.CreatedDate != null && p2.CreatedDate != null) {
|
||||||
|
@ -46,16 +56,83 @@ object SparkPreProcessMAG {
|
||||||
r
|
r
|
||||||
}.map(_._2)
|
}.map(_._2)
|
||||||
|
|
||||||
val distinctPaper: Dataset[Papers] = spark.createDataset(result)
|
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
|
||||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
|
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
|
||||||
logger.info(s"Total number of element: ${result.count()}")
|
logger.info(s"Total number of element: ${result.count()}")
|
||||||
|
|
||||||
logger.info("Phase 2) convert InverdIndex Abastrac to string")
|
logger.info("Phase 3) Group Author by PaperId")
|
||||||
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[PaperAbstract]
|
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||||
|
|
||||||
|
val affiliation =spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
|
||||||
|
|
||||||
|
val paperAuthorAffiliation =spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
|
||||||
|
|
||||||
|
|
||||||
|
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
|
||||||
|
.map{case (a:MagPaperAuthorAffiliation,b:MagAuthor )=> (a.AffiliationId,MagPaperAuthorDenormalized(a.PaperId, b, null)) }
|
||||||
|
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
|
||||||
|
.map(s => {
|
||||||
|
val mpa = s._1._2
|
||||||
|
val af = s._2
|
||||||
|
if (af!= null) {
|
||||||
|
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName)
|
||||||
|
} else
|
||||||
|
mpa
|
||||||
|
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
|
||||||
|
|
||||||
|
|
||||||
|
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
||||||
|
|
||||||
|
val papers =spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
|
||||||
|
|
||||||
|
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
val firstJoin =papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")),"left")
|
||||||
|
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
|
||||||
|
.map { a: ((MagPapers, MagJournal), MagPaperWithAuthorList) => ConversionUtil.createOAFFromJournalAuthorPaper(a) }.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
var magPubs:Dataset[(String,Publication)] = spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)]
|
||||||
|
|
||||||
|
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Phase 5) enrich publication with URL and Instances")
|
||||||
|
|
||||||
|
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
|
||||||
|
.map{a:((String,Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2))}
|
||||||
|
.write.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"${parser.get("targetPath")}/merge_step_3")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Phase 6) Enrich Publication with description")
|
||||||
|
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||||
|
|
||||||
|
val paperAbstract =spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
|
||||||
|
|
||||||
distinctPaper.joinWith(pa, col("PaperId").eqia)
|
|
||||||
|
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication].map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String,Publication)]
|
||||||
|
|
||||||
|
magPubs.joinWith(paperAbstract,col("_1").equalTo(paperAbstract("PaperId")), "left").map(p=>
|
||||||
|
{
|
||||||
|
val pub = p._1._2
|
||||||
|
val abst = p._2
|
||||||
|
if (abst!= null) {
|
||||||
|
pub.setDescription(List(asField(abst.IndexedAbstract)).asJava)
|
||||||
|
}
|
||||||
|
pub
|
||||||
|
}
|
||||||
|
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -72,6 +72,7 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
|
|
@ -11,6 +11,10 @@
|
||||||
<name>queueName</name>
|
<name>queueName</name>
|
||||||
<value>default</value>
|
<value>default</value>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
<value>spark2</value>
|
<value>spark2</value>
|
||||||
|
|
|
@ -20,6 +20,10 @@
|
||||||
<name>sparkExecutorCores</name>
|
<name>sparkExecutorCores</name>
|
||||||
<description>number of cores used by single executor</description>
|
<description>number of cores used by single executor</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
|
@ -1,10 +1,21 @@
|
||||||
package eu.dnetlib.doiboost.mag
|
package eu.dnetlib.doiboost.mag
|
||||||
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
|
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.api.java.function.MapFunction
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import org.junit.jupiter.api.Assertions._
|
import org.junit.jupiter.api.Assertions._
|
||||||
|
import org.apache.spark.sql.functions._
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
import scala.reflect.ClassTag
|
||||||
|
import scala.util.matching.Regex
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MAGMappingTest {
|
class MAGMappingTest {
|
||||||
|
@ -13,14 +24,49 @@ class MAGMappingTest {
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
//@Test
|
@Test
|
||||||
def testMAGCSV(): Unit = {
|
def testMAGCSV(): Unit = {
|
||||||
SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
|
// SparkPreProcessMAG.main("-m local[*] -s /data/doiboost/mag/datasets -t /data/doiboost/mag/datasets/preprocess".split(" "))
|
||||||
|
|
||||||
|
val sparkConf: SparkConf = new SparkConf
|
||||||
|
|
||||||
|
val spark: SparkSession = SparkSession.builder()
|
||||||
|
.config(sparkConf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master("local[*]")
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
|
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
||||||
|
implicit val longBarEncoder = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||||
|
|
||||||
|
val sourcePath = "/data/doiboost/mag/input"
|
||||||
|
|
||||||
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
|
||||||
|
|
||||||
|
val magOAF = spark.read.load("$sourcePath/merge_step_4").as[Publication]
|
||||||
|
|
||||||
|
println(magOAF.first().getOriginalId)
|
||||||
|
|
||||||
|
|
||||||
|
magOAF.map(k => (ConversionUtil.extractMagIdentifier(k.getOriginalId.asScala),k)).as[(String,Publication)].show()
|
||||||
|
|
||||||
|
|
||||||
|
println((ConversionUtil.extractMagIdentifier(magOAF.first().getOriginalId.asScala)))
|
||||||
|
|
||||||
|
val magIDRegex: Regex = "^[0-9]+$".r
|
||||||
|
|
||||||
|
|
||||||
|
println(magIDRegex.findFirstMatchIn("suca").isDefined)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def buildInvertedIndexTest() :Unit = {
|
def buildInvertedIndexTest(): Unit = {
|
||||||
val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
|
val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
|
||||||
val description = ConversionUtil.convertInvertedIndexString(json_input)
|
val description = ConversionUtil.convertInvertedIndexString(json_input)
|
||||||
assertNotNull(description)
|
assertNotNull(description)
|
||||||
|
@ -32,3 +78,5 @@ class MAGMappingTest {
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class PropagationConstant {
|
public class PropagationConstant {
|
||||||
|
@ -24,10 +26,6 @@ public class PropagationConstant {
|
||||||
|
|
||||||
public static final String TRUE = "true";
|
public static final String TRUE = "true";
|
||||||
|
|
||||||
public static final String DNET_COUNTRY_SCHEMA = "dnet:countries";
|
|
||||||
public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
|
|
||||||
public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
|
|
||||||
|
|
||||||
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
|
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos";
|
||||||
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
|
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories";
|
||||||
|
|
||||||
|
@ -46,22 +44,6 @@ public class PropagationConstant {
|
||||||
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
|
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
|
||||||
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
|
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
|
||||||
|
|
||||||
public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization";
|
|
||||||
public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation";
|
|
||||||
public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf";
|
|
||||||
public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult";
|
|
||||||
|
|
||||||
public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject";
|
|
||||||
public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome";
|
|
||||||
public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy";
|
|
||||||
public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces";
|
|
||||||
|
|
||||||
public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges";
|
|
||||||
|
|
||||||
public static final String PROPAGATION_AUTHOR_PID = "ORCID";
|
public static final String PROPAGATION_AUTHOR_PID = "ORCID";
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
@ -76,8 +58,8 @@ public class PropagationConstant {
|
||||||
Country nc = new Country();
|
Country nc = new Country();
|
||||||
nc.setClassid(classid);
|
nc.setClassid(classid);
|
||||||
nc.setClassname(classname);
|
nc.setClassname(classname);
|
||||||
nc.setSchemename(DNET_COUNTRY_SCHEMA);
|
nc.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
|
||||||
nc.setSchemeid(DNET_COUNTRY_SCHEMA);
|
nc.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
|
||||||
nc
|
nc
|
||||||
.setDataInfo(
|
.setDataInfo(
|
||||||
getDataInfo(
|
getDataInfo(
|
||||||
|
@ -102,8 +84,8 @@ public class PropagationConstant {
|
||||||
Qualifier pa = new Qualifier();
|
Qualifier pa = new Qualifier();
|
||||||
pa.setClassid(inference_class_id);
|
pa.setClassid(inference_class_id);
|
||||||
pa.setClassname(inference_class_name);
|
pa.setClassname(inference_class_name);
|
||||||
pa.setSchemeid(DNET_SCHEMA_ID);
|
pa.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
pa.setSchemename(DNET_SCHEMA_NAME);
|
pa.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
return pa;
|
return pa;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.bulktag;
|
package eu.dnetlib.dhp.bulktag;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
@ -84,6 +85,7 @@ public class SparkBulkTagJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
|
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -100,7 +101,7 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
+ "JOIN ( SELECT source, target "
|
+ "JOIN ( SELECT source, target "
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE relclass = '"
|
+ " WHERE relclass = '"
|
||||||
+ RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
|
+ ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
+ "' "
|
||||||
+ " AND datainfo.deletedbyinference = false ) rel "
|
+ " AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source "
|
+ "ON d.id = rel.source "
|
||||||
|
|
|
@ -69,13 +69,16 @@ public class SparkCountryPropagationJob {
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> execPropagation(
|
spark -> {
|
||||||
spark,
|
removeOutputDir(spark, outputPath);
|
||||||
sourcePath,
|
execPropagation(
|
||||||
preparedInfoPath,
|
spark,
|
||||||
outputPath,
|
sourcePath,
|
||||||
resultClazz,
|
preparedInfoPath,
|
||||||
saveGraph));
|
outputPath,
|
||||||
|
resultClazz,
|
||||||
|
saveGraph);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> void execPropagation(
|
private static <R extends Result> void execPropagation(
|
||||||
|
|
|
@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
prepareInfo(
|
prepareInfo(
|
||||||
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
|
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
|
||||||
});
|
});
|
||||||
|
@ -97,22 +95,22 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
|
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
|
||||||
result.createOrReplaceTempView("result");
|
result.createOrReplaceTempView("result");
|
||||||
|
|
||||||
String query = " select target resultId, author authorList"
|
String query = "SELECT target resultId, author authorList"
|
||||||
+ " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
|
+ " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
|
||||||
+ " from ( "
|
+ " FROM ( "
|
||||||
+ " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
|
+ " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
|
||||||
+ " from result "
|
+ " FROM result "
|
||||||
+ " lateral view explode (author) a as MyT "
|
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
|
||||||
+ " lateral view explode (MyT.pid) p as MyP "
|
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
|
||||||
+ " where MyP.qualifier.classid = 'ORCID') tmp "
|
+ " WHERE MyP.qualifier.classid = 'ORCID') tmp "
|
||||||
+ " group by id) r_t "
|
+ " GROUP BY id) r_t "
|
||||||
+ " join ("
|
+ " JOIN ("
|
||||||
+ " select source, target "
|
+ " SELECT source, target "
|
||||||
+ " from relation "
|
+ " FROM relation "
|
||||||
+ " where datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ getConstraintList(" relclass = '", allowedsemrel)
|
+ getConstraintList(" relclass = '", allowedsemrel)
|
||||||
+ ") rel_rel "
|
+ " ) rel_rel "
|
||||||
+ " on source = id";
|
+ " ON source = id";
|
||||||
spark
|
spark
|
||||||
.sql(query)
|
.sql(query)
|
||||||
.as(Encoders.bean(ResultOrcidList.class))
|
.as(Encoders.bean(ResultOrcidList.class))
|
||||||
|
|
|
@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
mergeInfo(spark, inputPath, outputPath);
|
mergeInfo(spark, inputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,11 +70,10 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
|
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,16 +131,16 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
|
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
|
||||||
boolean toaddpid = false;
|
boolean toaddpid = false;
|
||||||
|
|
||||||
if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
||||||
if (StringUtils.isNoneEmpty(author.getSurname())) {
|
if (StringUtils.isNotEmpty(author.getSurname())) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getSurname()
|
.getSurname()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getSurname().trim())) {
|
.equalsIgnoreCase(author.getSurname().trim())) {
|
||||||
|
|
||||||
// have the same surname. Check the name
|
// have the same surname. Check the name
|
||||||
if (StringUtils.isNoneEmpty(autoritative_author.getName())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
||||||
if (StringUtils.isNoneEmpty(author.getName())) {
|
if (StringUtils.isNotEmpty(author.getName())) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
|
@ -150,12 +149,14 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
}
|
}
|
||||||
// they could be differently written (i.e. only the initials of the name
|
// they could be differently written (i.e. only the initials of the name
|
||||||
// in one of the two
|
// in one of the two
|
||||||
if (autoritative_author
|
else {
|
||||||
.getName()
|
if (autoritative_author
|
||||||
.trim()
|
.getName()
|
||||||
.substring(0, 0)
|
.trim()
|
||||||
.equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
|
.substring(0, 0)
|
||||||
toaddpid = true;
|
.equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
|
||||||
|
toaddpid = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareProjectResultsAssociation {
|
public class PrepareProjectResultsAssociation {
|
||||||
|
@ -60,6 +61,8 @@ public class PrepareProjectResultsAssociation {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, potentialUpdatePath);
|
||||||
|
removeOutputDir(spark, alreadyLinkedPath);
|
||||||
prepareResultProjProjectResults(
|
prepareResultProjProjectResults(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -83,7 +86,7 @@ public class PrepareProjectResultsAssociation {
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_RESULT_PROJECT_REL_CLASS
|
+ ModelConstants.IS_PRODUCED_BY
|
||||||
+ "'";
|
+ "'";
|
||||||
|
|
||||||
Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
|
Dataset<Row> resproj_relation = spark.sql(resproj_relation_query);
|
||||||
|
|
|
@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -122,9 +123,9 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
resId,
|
resId,
|
||||||
projectId,
|
projectId,
|
||||||
RELATION_RESULT_PROJECT_REL_CLASS,
|
ModelConstants.IS_PRODUCED_BY,
|
||||||
RELATION_RESULTPROJECT_REL_TYPE,
|
ModelConstants.RESULT_PROJECT,
|
||||||
RELATION_RESULTPROJECT_SUBREL_TYPE,
|
ModelConstants.OUTCOME,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
||||||
|
@ -133,9 +134,9 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
projectId,
|
projectId,
|
||||||
resId,
|
resId,
|
||||||
RELATION_PROJECT_RESULT_REL_CLASS,
|
ModelConstants.PRODUCES,
|
||||||
RELATION_RESULTPROJECT_REL_TYPE,
|
ModelConstants.RESULT_PROJECT,
|
||||||
RELATION_RESULTPROJECT_SUBREL_TYPE,
|
ModelConstants.OUTCOME,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
|
||||||
|
|
|
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class PrepareResultCommunitySet {
|
public class PrepareResultCommunitySet {
|
||||||
|
@ -55,9 +56,7 @@ public class PrepareResultCommunitySet {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
prepareInfo(spark, inputPath, outputPath, organizationMap);
|
prepareInfo(spark, inputPath, outputPath, organizationMap);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -76,13 +75,13 @@ public class PrepareResultCommunitySet {
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
|
+ ModelConstants.HAS_AUTHOR_INSTITUTION
|
||||||
+ "') result_organization "
|
+ "') result_organization "
|
||||||
+ "LEFT JOIN (SELECT source, collect_set(target) org_set "
|
+ "LEFT JOIN (SELECT source, collect_set(target) org_set "
|
||||||
+ " FROM relation "
|
+ " FROM relation "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
+ " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND relClass = '"
|
+ " AND relClass = '"
|
||||||
+ RELATION_REPRESENTATIVERESULT_RESULT_CLASS
|
+ ModelConstants.MERGES
|
||||||
+ "' "
|
+ "' "
|
||||||
+ " GROUP BY source) organization_organization "
|
+ " GROUP BY source) organization_organization "
|
||||||
+ "ON result_organization.target = organization_organization.source ";
|
+ "ON result_organization.target = organization_organization.source ";
|
||||||
|
|
|
@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
|
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,9 +53,7 @@ public class PrepareResultCommunitySetStep2 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
}
|
|
||||||
mergeInfo(spark, inputPath, outputPath);
|
mergeInfo(spark, inputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
@ -58,30 +59,15 @@ public class PrepareResultInstRepoAssociation {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
readNeededResources(spark, inputPath);
|
readNeededResources(spark, inputPath);
|
||||||
|
|
||||||
|
removeOutputDir(spark, datasourceOrganizationPath);
|
||||||
prepareDatasourceOrganization(spark, datasourceOrganizationPath);
|
prepareDatasourceOrganization(spark, datasourceOrganizationPath);
|
||||||
|
|
||||||
|
removeOutputDir(spark, alreadyLinkedPath);
|
||||||
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
|
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void prepareAlreadyLinkedAssociation(
|
|
||||||
SparkSession spark, String alreadyLinkedPath) {
|
|
||||||
String query = "Select source resultId, collect_set(target) organizationSet "
|
|
||||||
+ "from relation "
|
|
||||||
+ "where datainfo.deletedbyinference = false "
|
|
||||||
+ "and relClass = '"
|
|
||||||
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
|
|
||||||
+ "' "
|
|
||||||
+ "group by source";
|
|
||||||
|
|
||||||
spark
|
|
||||||
.sql(query)
|
|
||||||
.as(Encoders.bean(ResultOrganizationSet.class))
|
|
||||||
// TODO retry to stick with datasets
|
|
||||||
.toJavaRDD()
|
|
||||||
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
|
||||||
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void readNeededResources(SparkSession spark, String inputPath) {
|
private static void readNeededResources(SparkSession spark, String inputPath) {
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||||
datasource.createOrReplaceTempView("datasource");
|
datasource.createOrReplaceTempView("datasource");
|
||||||
|
@ -106,7 +92,7 @@ public class PrepareResultInstRepoAssociation {
|
||||||
+ "JOIN ( SELECT source, target "
|
+ "JOIN ( SELECT source, target "
|
||||||
+ "FROM relation "
|
+ "FROM relation "
|
||||||
+ "WHERE relclass = '"
|
+ "WHERE relclass = '"
|
||||||
+ RELATION_DATASOURCE_ORGANIZATION_REL_CLASS
|
+ ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
+ "' "
|
||||||
+ "AND datainfo.deletedbyinference = false ) rel "
|
+ "AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source ";
|
+ "ON d.id = rel.source ";
|
||||||
|
@ -119,4 +105,24 @@ public class PrepareResultInstRepoAssociation {
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(datasourceOrganizationPath);
|
.json(datasourceOrganizationPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void prepareAlreadyLinkedAssociation(
|
||||||
|
SparkSession spark, String alreadyLinkedPath) {
|
||||||
|
String query = "Select source resultId, collect_set(target) organizationSet "
|
||||||
|
+ "from relation "
|
||||||
|
+ "where datainfo.deletedbyinference = false "
|
||||||
|
+ "and relClass = '"
|
||||||
|
+ ModelConstants.HAS_AUTHOR_INSTITUTION
|
||||||
|
+ "' "
|
||||||
|
+ "group by source";
|
||||||
|
|
||||||
|
spark
|
||||||
|
.sql(query)
|
||||||
|
.as(Encoders.bean(ResultOrganizationSet.class))
|
||||||
|
// TODO retry to stick with datasets
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
||||||
|
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -83,10 +84,8 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
if (isTest(parser)) {
|
// removeOutputDir(spark, outputPath);
|
||||||
removeOutputDir(spark, outputPath);
|
if (saveGraph) {
|
||||||
}
|
|
||||||
if (saveGraph)
|
|
||||||
execPropagation(
|
execPropagation(
|
||||||
spark,
|
spark,
|
||||||
datasourceorganization,
|
datasourceorganization,
|
||||||
|
@ -94,6 +93,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
inputPath,
|
inputPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
resultClazz);
|
resultClazz);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -151,9 +151,9 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
orgId,
|
orgId,
|
||||||
resultId,
|
resultId,
|
||||||
RELATION_ORGANIZATION_RESULT_REL_CLASS,
|
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||||
RELATION_RESULTORGANIZATION_REL_TYPE,
|
ModelConstants.RESULT_ORGANIZATION,
|
||||||
RELATION_RESULTORGANIZATION_SUBREL_TYPE,
|
ModelConstants.AFFILIATION,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
||||||
|
@ -162,9 +162,9 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
getRelation(
|
getRelation(
|
||||||
resultId,
|
resultId,
|
||||||
orgId,
|
orgId,
|
||||||
RELATION_RESULT_ORGANIZATION_REL_CLASS,
|
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||||
RELATION_RESULTORGANIZATION_REL_TYPE,
|
ModelConstants.RESULT_ORGANIZATION,
|
||||||
RELATION_RESULTORGANIZATION_SUBREL_TYPE,
|
ModelConstants.AFFILIATION,
|
||||||
PROPAGATION_DATA_INFO_TYPE,
|
PROPAGATION_DATA_INFO_TYPE,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID,
|
||||||
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME));
|
||||||
|
|
|
@ -18,6 +18,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -42,8 +53,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -53,8 +62,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -64,8 +71,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +80,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -95,8 +98,6 @@
|
||||||
|
|
||||||
<action name="join_bulktag_publication">
|
<action name="join_bulktag_publication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-publication</name>
|
<name>bulkTagging-publication</name>
|
||||||
|
@ -124,8 +125,6 @@
|
||||||
|
|
||||||
<action name="join_bulktag_dataset">
|
<action name="join_bulktag_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-dataset</name>
|
<name>bulkTagging-dataset</name>
|
||||||
|
@ -153,8 +152,6 @@
|
||||||
|
|
||||||
<action name="join_bulktag_otherresearchproduct">
|
<action name="join_bulktag_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-orp</name>
|
<name>bulkTagging-orp</name>
|
||||||
|
@ -182,8 +179,6 @@
|
||||||
|
|
||||||
<action name="join_bulktag_software">
|
<action name="join_bulktag_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-software</name>
|
<name>bulkTagging-software</name>
|
||||||
|
|
|
@ -19,6 +19,17 @@
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -43,8 +54,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -54,18 +63,15 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="copy_wait"/>
|
<ok to="copy_wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +81,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
<ok to="copy_wait"/>
|
<ok to="copy_wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
@ -81,7 +82,6 @@
|
||||||
|
|
||||||
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
|
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
|
||||||
|
|
||||||
|
|
||||||
<fork name="fork_prepare_assoc_step1">
|
<fork name="fork_prepare_assoc_step1">
|
||||||
<path start="join_prepare_publication"/>
|
<path start="join_prepare_publication"/>
|
||||||
<path start="join_prepare_dataset"/>
|
<path start="join_prepare_dataset"/>
|
||||||
|
@ -230,8 +230,8 @@
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork-join-exec-propagation"/>
|
<ok to="fork-join-exec-propagation"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="fork-join-exec-propagation">
|
<fork name="fork-join-exec-propagation">
|
||||||
<path start="join_propagate_publication"/>
|
<path start="join_propagate_publication"/>
|
||||||
<path start="join_propagate_dataset"/>
|
<path start="join_propagate_dataset"/>
|
||||||
|
@ -271,6 +271,7 @@
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_dataset">
|
<action name="join_propagate_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -302,6 +303,7 @@
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_otherresearchproduct">
|
<action name="join_propagate_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -333,6 +335,7 @@
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagate_software">
|
<action name="join_propagate_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
|
|
@ -14,6 +14,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -42,8 +53,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -53,8 +62,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -64,8 +71,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -75,8 +80,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -86,28 +89,24 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -117,8 +116,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
|
|
@ -14,6 +14,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -38,8 +49,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -49,8 +58,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +67,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +76,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -101,8 +104,8 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
|
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork-join-exec-propagation"/>
|
<ok to="fork-join-exec-propagation"/>
|
||||||
|
@ -136,9 +139,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -165,9 +168,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -194,9 +197,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
@ -223,9 +226,9 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
|
|
|
@ -10,6 +10,17 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
<start to="reset_outputpath"/>
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
@ -38,8 +49,6 @@
|
||||||
|
|
||||||
<action name="copy_relation">
|
<action name="copy_relation">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -49,8 +58,6 @@
|
||||||
|
|
||||||
<action name="copy_publication">
|
<action name="copy_publication">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -60,8 +67,6 @@
|
||||||
|
|
||||||
<action name="copy_dataset">
|
<action name="copy_dataset">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -71,8 +76,6 @@
|
||||||
|
|
||||||
<action name="copy_orp">
|
<action name="copy_orp">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -82,8 +85,6 @@
|
||||||
|
|
||||||
<action name="copy_software">
|
<action name="copy_software">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
<arg>${nameNode}/${outputPath}/software</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -93,8 +94,6 @@
|
||||||
|
|
||||||
<action name="copy_organization">
|
<action name="copy_organization">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -104,8 +103,6 @@
|
||||||
|
|
||||||
<action name="copy_projects">
|
<action name="copy_projects">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
<arg>${nameNode}/${outputPath}/project</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -115,8 +112,6 @@
|
||||||
|
|
||||||
<action name="copy_datasources">
|
<action name="copy_datasources">
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||||
</distcp>
|
</distcp>
|
||||||
|
@ -125,6 +120,7 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait" to="prepare_result_organization_association"/>
|
<join name="wait" to="prepare_result_organization_association"/>
|
||||||
|
|
||||||
<action name="prepare_result_organization_association">
|
<action name="prepare_result_organization_association">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -176,12 +172,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -206,12 +202,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -236,12 +232,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -266,12 +262,12 @@
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||||
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
|
||||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||||
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait2"/>
|
<ok to="wait2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.hive;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelSupport.tableIdentifier;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
|
||||||
|
public class GraphHiveTableImporterJob {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GraphHiveTableImporterJob.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
String inputPath = parser.get("inputPath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
String hiveDbName = parser.get("hiveDbName");
|
||||||
|
log.info("hiveDbName: {}", hiveDbName);
|
||||||
|
|
||||||
|
final String className = parser.get("className");
|
||||||
|
log.info("className: {}", className);
|
||||||
|
|
||||||
|
Class<? extends Oaf> clazz = (Class<? extends Oaf>) Class.forName(className);
|
||||||
|
|
||||||
|
String hiveMetastoreUris = parser.get("hiveMetastoreUris");
|
||||||
|
log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.set("hive.metastore.uris", hiveMetastoreUris);
|
||||||
|
|
||||||
|
runWithSparkHiveSession(
|
||||||
|
conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
// protected for testing
|
||||||
|
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
|
||||||
|
Class<T> clazz) {
|
||||||
|
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.textFile(inputPath)
|
||||||
|
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.saveAsTable(tableIdentifier(hiveDbName, clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -127,7 +127,6 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final List<Oaf> oafs = new ArrayList<>();
|
final List<Oaf> oafs = new ArrayList<>();
|
||||||
|
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
case "":
|
|
||||||
case "publication":
|
case "publication":
|
||||||
final Publication p = new Publication();
|
final Publication p = new Publication();
|
||||||
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||||
|
@ -138,7 +137,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
case "dataset":
|
case "dataset":
|
||||||
final Dataset d = new Dataset();
|
final Dataset d = new Dataset();
|
||||||
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||||
d.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
|
d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
|
||||||
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||||
d.setDevice(prepareDatasetDevice(doc, info));
|
d.setDevice(prepareDatasetDevice(doc, info));
|
||||||
d.setSize(prepareDatasetSize(doc, info));
|
d.setSize(prepareDatasetSize(doc, info));
|
||||||
|
@ -158,6 +157,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
||||||
oafs.add(s);
|
oafs.add(s);
|
||||||
break;
|
break;
|
||||||
|
case "":
|
||||||
case "otherresearchproducts":
|
case "otherresearchproducts":
|
||||||
default:
|
default:
|
||||||
final OtherResearchProduct o = new OtherResearchProduct();
|
final OtherResearchProduct o = new OtherResearchProduct();
|
||||||
|
|
|
@ -50,8 +50,7 @@ import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
|
||||||
implements Closeable {
|
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
|
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
|
||||||
|
|
||||||
|
@ -128,9 +127,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Oaf> processDatasource(final ResultSet rs) {
|
public List<Oaf> processDatasource(final ResultSet rs) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
||||||
final DataInfo info = prepareDataInfo(rs);
|
final DataInfo info = prepareDataInfo(rs);
|
||||||
|
|
||||||
final Datasource ds = new Datasource();
|
final Datasource ds = new Datasource();
|
||||||
|
@ -194,7 +191,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
|
|
||||||
public List<Oaf> processProject(final ResultSet rs) {
|
public List<Oaf> processProject(final ResultSet rs) {
|
||||||
try {
|
try {
|
||||||
|
|
||||||
final DataInfo info = prepareDataInfo(rs);
|
final DataInfo info = prepareDataInfo(rs);
|
||||||
|
|
||||||
final Project p = new Project();
|
final Project p = new Project();
|
||||||
|
@ -249,9 +245,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Oaf> processOrganization(final ResultSet rs) {
|
public List<Oaf> processOrganization(final ResultSet rs) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
||||||
final DataInfo info = prepareDataInfo(rs);
|
final DataInfo info = prepareDataInfo(rs);
|
||||||
|
|
||||||
final Organization o = new Organization();
|
final Organization o = new Organization();
|
||||||
|
@ -370,14 +364,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
|
|
||||||
final DataInfo info = dataInfo(
|
final DataInfo info = dataInfo(
|
||||||
false, null, false, false,
|
false, null, false, false,
|
||||||
|
|
||||||
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9");
|
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9");
|
||||||
|
|
||||||
final List<KeyValue> collectedFrom = listKeyValues(
|
final List<KeyValue> collectedFrom = listKeyValues(
|
||||||
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
|
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
||||||
if (rs.getString(SOURCE_TYPE).equals("context")) {
|
if (rs.getString(SOURCE_TYPE).equals("context")) {
|
||||||
final Result r;
|
final Result r;
|
||||||
|
|
||||||
|
@ -461,9 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
final Boolean inferred = rs.getBoolean("inferred");
|
final Boolean inferred = rs.getBoolean("inferred");
|
||||||
final String trust = rs.getString("trust");
|
final String trust = rs.getString("trust");
|
||||||
return dataInfo(
|
return dataInfo(
|
||||||
|
deletedbyinference,
|
||||||
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
|
inferenceprovenance,
|
||||||
|
inferred,
|
||||||
|
false,
|
||||||
|
ENTITYREGISTRY_PROVENANCE_ACTION,
|
||||||
|
trust);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Qualifier prepareQualifierSplitting(final String s) {
|
private Qualifier prepareQualifierSplitting(final String s) {
|
||||||
|
@ -535,4 +530,5 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
|
||||||
super.close();
|
super.close();
|
||||||
dbClient.close();
|
dbClient.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -10,11 +9,13 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
|
import org.dom4j.Element;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
@ -28,15 +29,26 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final List<Author> res = new ArrayList<>();
|
final List<Author> res = new ArrayList<>();
|
||||||
int pos = 1;
|
int pos = 1;
|
||||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||||
final Node n = (Node) o;
|
final Element e = (Element) o;
|
||||||
final Author author = new Author();
|
final Author author = new Author();
|
||||||
author.setFullname(n.getText());
|
author.setFullname(e.getText());
|
||||||
author.setRank(pos++);
|
author.setRank(pos++);
|
||||||
final PacePerson p = new PacePerson(n.getText(), false);
|
final PacePerson p = new PacePerson(e.getText(), false);
|
||||||
if (p.isAccurate()) {
|
if (p.isAccurate()) {
|
||||||
author.setName(p.getNormalisedFirstName());
|
author.setName(p.getNormalisedFirstName());
|
||||||
author.setSurname(p.getNormalisedSurname());
|
author.setSurname(p.getNormalisedSurname());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String pid = e.attributeValue("nameIdentifier");
|
||||||
|
final String pidType = e.attributeValue("nameIdentifierScheme");
|
||||||
|
|
||||||
|
author.setPid(new ArrayList<>());
|
||||||
|
if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) {
|
||||||
|
author
|
||||||
|
.getPid()
|
||||||
|
.add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info));
|
||||||
|
}
|
||||||
|
|
||||||
res.add(author);
|
res.add(author);
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
@ -44,20 +45,35 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
for (final Object o : doc.selectNodes("//datacite:creator")) {
|
for (final Object o : doc.selectNodes("//datacite:creator")) {
|
||||||
final Node n = (Node) o;
|
final Node n = (Node) o;
|
||||||
final Author author = new Author();
|
final Author author = new Author();
|
||||||
author.setFullname(n.valueOf("./datacite:creatorName"));
|
final String fullname = n.valueOf("./datacite:creatorName");
|
||||||
author.setName(n.valueOf("./datacite:givenName"));
|
author.setFullname(fullname);
|
||||||
author.setSurname(n.valueOf("./datacite:familyName"));
|
|
||||||
author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info));
|
PacePerson pp = new PacePerson(fullname, false);
|
||||||
author.setPid(preparePids(doc, info));
|
final String name = n.valueOf("./datacite:givenName");
|
||||||
|
if (StringUtils.isBlank(name) & pp.isAccurate()) {
|
||||||
|
author.setName(pp.getNormalisedFirstName());
|
||||||
|
} else {
|
||||||
|
author.setName(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String surname = n.valueOf("./datacite:familyName");
|
||||||
|
if (StringUtils.isBlank(surname) & pp.isAccurate()) {
|
||||||
|
author.setSurname(pp.getNormalisedSurname());
|
||||||
|
} else {
|
||||||
|
author.setSurname(surname);
|
||||||
|
}
|
||||||
|
|
||||||
|
author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
|
||||||
|
author.setPid(preparePids(n, info));
|
||||||
author.setRank(pos++);
|
author.setRank(pos++);
|
||||||
res.add(author);
|
res.add(author);
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
|
private List<StructuredProperty> preparePids(final Node n, final DataInfo info) {
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) {
|
for (final Object o : n.selectNodes("./datacite:nameIdentifier")) {
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
structuredProperty(
|
structuredProperty(
|
||||||
|
@ -77,8 +93,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final KeyValue hostedby) {
|
final KeyValue hostedby) {
|
||||||
|
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
final Set<String> url = new HashSet<>();
|
|
||||||
instance.setUrl(new ArrayList<>());
|
|
||||||
instance
|
instance
|
||||||
.setInstancetype(
|
.setInstancetype(
|
||||||
prepareQualifier(
|
prepareQualifier(
|
||||||
|
@ -97,6 +111,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
.setProcessingchargecurrency(
|
.setProcessingchargecurrency(
|
||||||
field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
|
|
||||||
|
final Set<String> url = new HashSet<>();
|
||||||
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||||
url.add(((Node) o).getText().trim());
|
url.add(((Node) o).getText().trim());
|
||||||
}
|
}
|
||||||
|
@ -109,7 +124,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) {
|
for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) {
|
||||||
url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
|
url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
|
||||||
}
|
}
|
||||||
instance.getUrl().addAll(url);
|
if (!url.isEmpty()) {
|
||||||
|
instance.setUrl(new ArrayList<>());
|
||||||
|
instance.getUrl().addAll(url);
|
||||||
|
}
|
||||||
return Arrays.asList(instance);
|
return Arrays.asList(instance);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
DROP DATABASE IF EXISTS ${hiveDbName} CASCADE;
|
||||||
|
CREATE DATABASE ${hiveDbName};
|
|
@ -72,18 +72,45 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="MapGraphAsHiveDB"/>
|
<start to="reset_DB"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="MapGraphAsHiveDB">
|
<action name="reset_DB">
|
||||||
|
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>hive.metastore.uris</name>
|
||||||
|
<value>${hiveMetastoreUris}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
<jdbc-url>${hiveJdbcUrl}/${hiveDbName}</jdbc-url>
|
||||||
|
<script>lib/scripts/reset_db.sql</script>
|
||||||
|
<param>hiveDbName=${hiveDbName}</param>
|
||||||
|
</hive2>
|
||||||
|
<ok to="fork_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name="fork_import">
|
||||||
|
<path start="import_publication"/>
|
||||||
|
<path start="import_dataset"/>
|
||||||
|
<path start="import_orp"/>
|
||||||
|
<path start="import_software"/>
|
||||||
|
<path start="import_datasource"/>
|
||||||
|
<path start="import_organization"/>
|
||||||
|
<path start="import_project"/>
|
||||||
|
<path start="import_relation"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<action name="import_publication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>MapGraphAsHiveDB</name>
|
<name>Import table publication</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob</class>
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -95,18 +122,201 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${inputPath}</arg>
|
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="PostProcessing"/>
|
<ok to="join_import"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<action name="import_dataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table dataset</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_orp">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table otherresearchproduct</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_software">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table software</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_datasource">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table datasource</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_organization">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table organization</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/organization</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_project">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table project</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/project</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="import_relation">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Import table project</name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--inputPath</arg><arg>${inputPath}/relation</arg>
|
||||||
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
|
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="join_import"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name="join_import" to="PostProcessing"/>
|
||||||
|
|
||||||
<action name="PostProcessing">
|
<action name="PostProcessing">
|
||||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<configuration>
|
<configuration>
|
||||||
<property>
|
<property>
|
||||||
<name>hive.metastore.uris</name>
|
<name>hive.metastore.uris</name>
|
||||||
|
@ -122,4 +332,5 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -0,0 +1,26 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "in",
|
||||||
|
"paramLongName": "inputPath",
|
||||||
|
"paramDescription": "the path to the graph data dump to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "hmu",
|
||||||
|
"paramLongName": "hiveMetastoreUris",
|
||||||
|
"paramDescription": "the hive metastore uris",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "db",
|
||||||
|
"paramLongName": "hiveDbName",
|
||||||
|
"paramDescription": "the target hive database name",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,32 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "in",
|
||||||
|
"paramLongName": "inputPath",
|
||||||
|
"paramDescription": "the path to the graph data dump to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "hmu",
|
||||||
|
"paramLongName": "hiveMetastoreUris",
|
||||||
|
"paramDescription": "the hive metastore uris",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "db",
|
||||||
|
"paramLongName": "hiveDbName",
|
||||||
|
"paramDescription": "the target hive database name",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "tn",
|
||||||
|
"paramLongName": "className",
|
||||||
|
"paramDescription": "the class modelling the target table",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -1,10 +0,0 @@
|
||||||
[
|
|
||||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
|
||||||
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
|
|
||||||
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
|
|
||||||
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
|
|
||||||
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
|
|
||||||
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
|
|
||||||
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
|
|
||||||
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
|
|
||||||
]
|
|
|
@ -1,20 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"paramName": "mt",
|
|
||||||
"paramLongName": "master",
|
|
||||||
"paramDescription": "should be local or yarn",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "is",
|
|
||||||
"paramLongName": "isLookupUrl",
|
|
||||||
"paramDescription": "URL of the isLookUp Service",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "i",
|
|
||||||
"paramLongName": "inputPaths",
|
|
||||||
"paramDescription": "URL of the isLookUp Service",
|
|
||||||
"paramRequired": true
|
|
||||||
}
|
|
||||||
]
|
|
|
@ -10,6 +10,7 @@ import static org.mockito.Mockito.when;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -19,11 +20,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
public class MappersTest {
|
public class MappersTest {
|
||||||
|
@ -54,7 +52,29 @@ public class MappersTest {
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
|
||||||
assertTrue(p.getAuthor().size() > 0);
|
assertTrue(p.getAuthor().size() > 0);
|
||||||
|
Optional<Author> author = p
|
||||||
|
.getAuthor()
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
|
.findFirst();
|
||||||
|
assertTrue(author.isPresent());
|
||||||
|
StructuredProperty pid = author
|
||||||
|
.get()
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.findFirst()
|
||||||
|
.get();
|
||||||
|
assertEquals("0000-0001-6651-1178", pid.getValue());
|
||||||
|
assertEquals("ORCID", pid.getQualifier().getClassid());
|
||||||
|
assertEquals("ORCID", pid.getQualifier().getClassname());
|
||||||
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
|
assertEquals("Votsi,Nefta", author.get().getFullname());
|
||||||
|
assertEquals("Votsi", author.get().getSurname());
|
||||||
|
assertEquals("Nefta", author.get().getName());
|
||||||
|
|
||||||
assertTrue(p.getSubject().size() > 0);
|
assertTrue(p.getSubject().size() > 0);
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
||||||
|
@ -100,6 +120,38 @@ public class MappersTest {
|
||||||
assertValidId(d.getCollectedfrom().get(0).getKey());
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
assertTrue(d.getAuthor().size() > 0);
|
assertTrue(d.getAuthor().size() > 0);
|
||||||
|
|
||||||
|
Optional<Author> author = d
|
||||||
|
.getAuthor()
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||||
|
.findFirst();
|
||||||
|
assertTrue(author.isPresent());
|
||||||
|
StructuredProperty pid = author
|
||||||
|
.get()
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.findFirst()
|
||||||
|
.get();
|
||||||
|
assertEquals("0000-0001-9074-1619", pid.getValue());
|
||||||
|
assertEquals("ORCID", pid.getQualifier().getClassid());
|
||||||
|
assertEquals("ORCID", pid.getQualifier().getClassname());
|
||||||
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
|
assertEquals("Baracchini, Theo", author.get().getFullname());
|
||||||
|
assertEquals("Baracchini", author.get().getSurname());
|
||||||
|
assertEquals("Theo", author.get().getName());
|
||||||
|
|
||||||
|
assertEquals(1, author.get().getAffiliation().size());
|
||||||
|
Optional<Field<String>> opAff = author
|
||||||
|
.get()
|
||||||
|
.getAffiliation()
|
||||||
|
.stream()
|
||||||
|
.findFirst();
|
||||||
|
assertTrue(opAff.isPresent());
|
||||||
|
Field<String> affiliation = opAff.get();
|
||||||
|
assertEquals("ISTI-CNR", affiliation.getValue());
|
||||||
|
|
||||||
assertTrue(d.getSubject().size() > 0);
|
assertTrue(d.getSubject().size() > 0);
|
||||||
assertTrue(d.getInstance().size() > 0);
|
assertTrue(d.getInstance().size() > 0);
|
||||||
assertTrue(d.getContext().size() > 0);
|
assertTrue(d.getContext().size() > 0);
|
||||||
|
|
|
@ -19,7 +19,7 @@
|
||||||
<metadata xmlns="http://namespace.openaire.eu/">
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
|
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
|
||||||
<dc:creator>Nikolaidou,Charitini</dc:creator>
|
<dc:creator>Nikolaidou,Charitini</dc:creator>
|
||||||
<dc:creator>Votsi,Nefta</dc:creator>
|
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
|
||||||
<dc:creator>Sgardelis,Steanos</dc:creator>
|
<dc:creator>Sgardelis,Steanos</dc:creator>
|
||||||
<dc:creator>Halley,John</dc:creator>
|
<dc:creator>Halley,John</dc:creator>
|
||||||
<dc:creator>Pantis,John</dc:creator>
|
<dc:creator>Pantis,John</dc:creator>
|
||||||
|
|
|
@ -35,9 +35,10 @@
|
||||||
</creator>
|
</creator>
|
||||||
<creator>
|
<creator>
|
||||||
<creatorName>Baracchini, Theo</creatorName>
|
<creatorName>Baracchini, Theo</creatorName>
|
||||||
|
<nameIdentifier nameIdentifierScheme="ORCID">0000-0001-9074-1619</nameIdentifier>
|
||||||
<givenName>Theo</givenName>
|
<givenName>Theo</givenName>
|
||||||
<familyName>Baracchini</familyName>
|
<familyName>Baracchini</familyName>
|
||||||
<affiliation>Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland</affiliation>
|
<affiliation>ISTI-CNR</affiliation>
|
||||||
</creator>
|
</creator>
|
||||||
<creator>
|
<creator>
|
||||||
<creatorName>Wüest, Alfred</creatorName>
|
<creatorName>Wüest, Alfred</creatorName>
|
||||||
|
|
Loading…
Reference in New Issue