forked from D-Net/dnet-hadoop
merge upstream
This commit is contained in:
commit
a634794242
|
@ -57,9 +57,9 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[3.0.4,4.0.0)</version>
|
||||
<version>[3.0.0-SNAPSHOT,)</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class CheckDuplictedIdsJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
CheckDuplictedIdsJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String countPath = parser.get("workingPath") + "/counts";
|
||||
log.info("countPath: {}", countPath);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id");
|
||||
|
||||
final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();
|
||||
|
||||
ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.groupByKey(t -> t._1, Encoders.STRING())
|
||||
.agg(agg)
|
||||
.map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.filter(t -> t._2 > 1)
|
||||
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(countPath);
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
private static String eventAsJsonString(final Event f) throws JsonProcessingException {
|
||||
return new ObjectMapper().writeValueAsString(f);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class CountAggregator extends Aggregator<Tuple2<String, Long>, Tuple2<String, Long>, Tuple2<String, Long>> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1395935985734672538L;
|
||||
|
||||
@Override
|
||||
public Encoder<Tuple2<String, Long>> bufferEncoder() {
|
||||
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> finish(final Tuple2<String, Long> arg0) {
|
||||
return arg0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> merge(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
|
||||
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
|
||||
return new Tuple2<>(s, arg0._2 + arg1._2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<Tuple2<String, Long>> outputEncoder() {
|
||||
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> reduce(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
|
||||
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
|
||||
return new Tuple2<>(s, arg0._2 + arg1._2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> zero() {
|
||||
return new Tuple2<>(null, 0l);
|
||||
}
|
||||
|
||||
}
|
|
@ -3,8 +3,6 @@ package eu.dnetlib.dhp.broker.oa;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
@ -63,13 +61,13 @@ public class GenerateEventsJob {
|
|||
final String eventsPath = workingPath + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final Set<String> dsIdWhitelist = parseParamAsList(parser, "datasourceIdWhitelist");
|
||||
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
|
||||
log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ","));
|
||||
|
||||
final Set<String> dsTypeWhitelist = parseParamAsList(parser, "datasourceTypeWhitelist");
|
||||
final Set<String> dsTypeWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceTypeWhitelist");
|
||||
log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ","));
|
||||
|
||||
final Set<String> dsIdBlacklist = parseParamAsList(parser, "datasourceIdBlacklist");
|
||||
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
||||
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -103,22 +101,6 @@ public class GenerateEventsJob {
|
|||
|
||||
}
|
||||
|
||||
private static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
|
||||
final String s = parser.get(key).trim();
|
||||
|
||||
final Set<String> res = new HashSet<>();
|
||||
|
||||
if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
|
||||
Arrays
|
||||
.stream(s.split(","))
|
||||
.map(String::trim)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.forEach(res::add);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {
|
||||
|
||||
return EventFinder
|
||||
|
|
|
@ -48,7 +48,7 @@ public class IndexOnESJob {
|
|||
|
||||
final JavaRDD<String> inputRdd = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.limit(10000) // TODO REMOVE
|
||||
// .limit(10000) // TODO REMOVE
|
||||
.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
|
||||
.javaRDD();
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@ public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
|
|||
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { return new ArrayList<>(); }
|
||||
if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
final Set<String> existingProjects = target
|
||||
.getProjects()
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
|
@ -10,6 +15,7 @@ import org.apache.spark.util.LongAccumulator;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
|
||||
public class ClusterUtils {
|
||||
|
@ -53,7 +59,9 @@ public class ClusterUtils {
|
|||
return o;
|
||||
}
|
||||
|
||||
public static <T> void save(final Dataset<T> dataset, final String path, final Class<T> clazz,
|
||||
public static <T> void save(final Dataset<T> dataset,
|
||||
final String path,
|
||||
final Class<T> clazz,
|
||||
final LongAccumulator acc) {
|
||||
dataset
|
||||
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
||||
|
@ -62,4 +70,20 @@ public class ClusterUtils {
|
|||
.json(path);
|
||||
}
|
||||
|
||||
public static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
|
||||
final String s = parser.get(key).trim();
|
||||
|
||||
final Set<String> res = new HashSet<>();
|
||||
|
||||
if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
|
||||
Arrays
|
||||
.stream(s.split(","))
|
||||
.map(String::trim)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.forEach(res::add);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -44,7 +44,9 @@ public class ConversionUtils {
|
|||
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
|
||||
|
||||
public static List<OaBrokerInstance> oafInstanceToBrokerInstances(final Instance i) {
|
||||
if (i == null) { return new ArrayList<>(); }
|
||||
if (i == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return mappedList(i.getUrl(), url -> {
|
||||
final OaBrokerInstance res = new OaBrokerInstance();
|
||||
|
@ -65,7 +67,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
if (d == null) { return null; }
|
||||
if (d == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
||||
res.setOpenaireId(d.getId());
|
||||
|
@ -78,7 +82,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) {
|
||||
if (p == null) { return null; }
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
||||
res.setOpenaireId(p.getId());
|
||||
|
@ -92,7 +98,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) {
|
||||
if (result == null) { return null; }
|
||||
if (result == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
|
||||
|
@ -109,7 +117,8 @@ public class ConversionUtils {
|
|||
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
|
||||
res.setContributor(fieldList(result.getContributor()));
|
||||
res
|
||||
.setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
|
||||
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
|
||||
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
|
||||
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
|
@ -121,7 +130,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
||||
if (author == null) { return null; }
|
||||
if (author == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final String pids = author.getPid() != null ? author
|
||||
.getPid()
|
||||
|
@ -145,7 +156,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
|
||||
if (journal == null) { return null; }
|
||||
if (journal == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerJournal res = new OaBrokerJournal();
|
||||
res.setName(journal.getName());
|
||||
|
@ -157,7 +170,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
|
||||
if (ref == null) { return null; }
|
||||
if (ref == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerExternalReference res = new OaBrokerExternalReference();
|
||||
res.setRefidentifier(ref.getRefidentifier());
|
||||
|
@ -168,7 +183,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static final OaBrokerProject oafProjectToBrokerProject(final Project p) {
|
||||
if (p == null) { return null; }
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerProject res = new OaBrokerProject();
|
||||
res.setOpenaireId(p.getId());
|
||||
|
@ -192,7 +209,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) {
|
||||
if (sw == null) { return null; }
|
||||
if (sw == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
||||
res.setOpenaireId(sw.getId());
|
||||
|
@ -255,7 +274,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static List<OaBrokerTypedValue> structPropTypedList(final List<StructuredProperty> list) {
|
||||
if (list == null) { return new ArrayList<>(); }
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
|
@ -265,7 +286,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) { return new ArrayList<>(); }
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
|
@ -276,7 +299,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
|
||||
if (list == null) { return new ArrayList<>(); }
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
|
@ -288,7 +313,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) { return null; }
|
||||
if (list == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
|
|
|
@ -70,7 +70,6 @@ public class EventFinder {
|
|||
matchers.add(new EnrichMissingDatasetReferences());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
|
||||
matchers.add(new EnrichMissingAbstract());
|
||||
}
|
||||
|
||||
public static EventGroup generateEvents(final ResultGroup results,
|
||||
|
|
|
@ -447,7 +447,7 @@
|
|||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="2"
|
||||
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -73,33 +73,32 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="index_es"/>
|
||||
<start to="count"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="index_es">
|
||||
<action name="count">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>IndexOnESJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
|
||||
<name>Count</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.CheckDuplictedIdsJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="2"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--index</arg><arg>${esIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -45,6 +45,16 @@ public class EntityMergerTest implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void softwareMergerTest() throws InstantiationException, IllegalAccessException {
|
||||
List<Tuple2<String, Software>> softwares = readSample(testEntityBasePath + "/software_merge.json", Software.class);
|
||||
|
||||
Software merged = DedupRecordFactory
|
||||
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
||||
|
||||
System.out.println(merged.getBestaccessright().getClassid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
||||
|
||||
|
|
|
@ -182,7 +182,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
assertEquals(3432, orgs_simrel);
|
||||
assertEquals(7152, pubs_simrel);
|
||||
assertEquals(7054, pubs_simrel);
|
||||
assertEquals(344, sw_simrel);
|
||||
assertEquals(458, ds_simrel);
|
||||
assertEquals(6750, orp_simrel);
|
||||
|
@ -234,7 +234,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
assertEquals(1276, orgs_mergerel);
|
||||
assertEquals(1442, pubs_mergerel);
|
||||
assertEquals(1440, pubs_mergerel);
|
||||
assertEquals(288, sw_mergerel);
|
||||
assertEquals(472, ds_mergerel);
|
||||
assertEquals(718, orp_mergerel);
|
||||
|
@ -423,7 +423,7 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||
|
||||
assertEquals(4975, relations);
|
||||
assertEquals(4971, relations);
|
||||
|
||||
// check deletedbyinference
|
||||
final Dataset<Relation> mergeRels = spark
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
"subEntityType" : "resulttype",
|
||||
"subEntityValue" : "dataset",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"queueMaxSize" : "800",
|
||||
"groupMaxSize" : "100",
|
||||
"maxChildren" : "100",
|
||||
"slidingWindowSize" : "200",
|
||||
"slidingWindowSize" : "80",
|
||||
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||
"includeChildren" : "true",
|
||||
"idPath" : "$.id",
|
||||
|
@ -17,8 +17,7 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
"decisionTree" : {
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
"subEntityType" : "resulttype",
|
||||
"subEntityValue" : "otherresearchproduct",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"queueMaxSize" : "800",
|
||||
"groupMaxSize" : "100",
|
||||
"maxChildren" : "100",
|
||||
"slidingWindowSize" : "200",
|
||||
"slidingWindowSize" : "80",
|
||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||
"includeChildren" : "true",
|
||||
"idPath" : "$.id",
|
||||
|
@ -17,8 +17,7 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
"decisionTree" : {
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
"subEntityType": "resulttype",
|
||||
"subEntityValue": "publication",
|
||||
"orderField": "title",
|
||||
"queueMaxSize": "2000",
|
||||
"queueMaxSize": "800",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "200",
|
||||
"slidingWindowSize": "80",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
|
@ -29,8 +29,7 @@
|
|||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
"decisionTree": {
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
"subEntityType" : "resulttype",
|
||||
"subEntityValue" : "software",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"queueMaxSize" : "800",
|
||||
"groupMaxSize" : "100",
|
||||
"maxChildren" : "100",
|
||||
"slidingWindowSize" : "200",
|
||||
"slidingWindowSize" : "80",
|
||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||
"includeChildren" : "true",
|
||||
"idPath" : "$.id",
|
||||
|
@ -17,8 +17,7 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
|
||||
],
|
||||
"decisionTree": {
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -8,6 +8,7 @@ import java.util.Objects;
|
|||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -97,7 +98,7 @@ public class CleanGraphSparkJob {
|
|||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> T fixDefaults(T value) {
|
||||
protected static <T extends Oaf> T fixDefaults(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -134,11 +135,6 @@ public class CleanGraphSparkJob {
|
|||
.setResourcetype(
|
||||
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
r
|
||||
.setBestaccessright(
|
||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
|
@ -152,6 +148,15 @@ public class CleanGraphSparkJob {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
|
||||
if (Objects.isNull(bestaccessrights)) {
|
||||
r.setBestaccessright(
|
||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
} else {
|
||||
r.setBestaccessright(bestaccessrights);
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
boolean nullRank = r
|
||||
.getAuthor()
|
||||
|
|
|
@ -378,6 +378,10 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||
|
||||
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
||||
return getBestAccessRights(instanceList);
|
||||
}
|
||||
|
||||
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
||||
if (instanceList != null) {
|
||||
final Optional<Qualifier> min = instanceList
|
||||
|
|
|
@ -57,6 +57,8 @@ public class CleaningFunctionTest {
|
|||
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||
|
||||
assertNull(p_in.getBestaccessright());
|
||||
|
||||
assertTrue(p_in instanceof Result);
|
||||
assertTrue(p_in instanceof Publication);
|
||||
|
||||
|
@ -84,6 +86,9 @@ public class CleaningFunctionTest {
|
|||
.map(p -> p.getQualifier())
|
||||
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||
|
||||
Publication p_defaults = CleanGraphSparkJob.fixDefaults(p_out);
|
||||
assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid());
|
||||
|
||||
// TODO add more assertions to verity the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_out));
|
||||
|
||||
|
|
|
@ -185,12 +185,7 @@
|
|||
"surname": ""
|
||||
}
|
||||
],
|
||||
"bestaccessright": {
|
||||
"classid": "CLOSED",
|
||||
"classname": "Closed Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"bestaccessright": null,
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
|
|
|
@ -156,6 +156,7 @@ public class PrepareRelationsJob {
|
|||
.parquet(outputPath);
|
||||
}
|
||||
|
||||
// experimental
|
||||
private static void prepareRelationsDataset(
|
||||
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
|
||||
int relPartitions) {
|
||||
|
|
|
@ -134,6 +134,7 @@
|
|||
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
|
||||
<arg>--maxRelations</arg><arg>${maxRelations}</arg>
|
||||
<arg>--relationFilter</arg><arg>${relationFilter}</arg>
|
||||
<arg>--relPartitions</arg><arg>5000</arg>
|
||||
</spark>
|
||||
<ok to="fork_join_related_entities"/>
|
||||
|
|
|
@ -11,6 +11,29 @@
|
|||
<WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
|
||||
<WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
|
||||
<CONFIGURATION start="manual">
|
||||
|
||||
<NODE isStart="true" name="setReuseContent" type="SetEnvParameter">
|
||||
<DESCRIPTION>reuse cached content from the aggregation system</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">reuseContent</PARAM>
|
||||
<PARAM function="validValues(['true', 'false'])" managedBy="user" name="parameterValue" required="true" type="string">true</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setContentPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>set the aggregator content path</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">contentPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_aggregator</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setAggregatorGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the path containing the AGGREGATOR graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -62,87 +85,94 @@
|
|||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setCleanedGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
|
||||
<NODE isStart="true" name="setOrcidGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_cleaned</PARAM>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_orcid</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setOrcidGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_orcid</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
<NODE isStart="true" name="setBulkTaggingGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the BULK TAGGED graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">bulkTaggingGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_bulktagging</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_bulktagging</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setAffiliationGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">affiliationGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_affiliation</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_affiliation</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setCommunityOrganizationGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the COMMUNITY from SELECTED SOURCES graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">communityOrganizationGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_comunity_organization</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_comunity_organization</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setFundingGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the FUNDING from SEMANTIC RELATION graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">fundingGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_funding</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_funding</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setCommunitySemRelGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the COMMUNITY from SEMANTIC RELATION graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">communitySemRelGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_comunity_sem_rel</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_comunity_sem_rel</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setCountryGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the COUNTRY enriched graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">countryGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_country</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_country</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setCleanedGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
|
||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_cleaned</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setBlacklistedGraphPath" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the target path to store the blacklisted graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -153,6 +183,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setIsLookUpUrl" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the lookup address</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -163,6 +194,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setBulkTaggingPathMap" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the map of paths for the Bulk Tagging</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -173,6 +205,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setPropagationOrganizationCommunityMap" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the map of associations organization, community list for the propagation of community to result through organization</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -185,6 +218,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="setDedupConfig" type="SetEnvParameter">
|
||||
<DESCRIPTION>Set the dedup orchestrator name</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -195,6 +229,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="actionSetsRaw" type="SetEnvParameter">
|
||||
<DESCRIPTION>declares the ActionSet ids to promote in the RAW graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -205,6 +240,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isStart="true" name="actionSetsIIS" type="SetEnvParameter">
|
||||
<DESCRIPTION>declares the ActionSet ids to promote in the INFERRED graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -215,6 +251,7 @@
|
|||
<ARC to="waitConfig"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE isJoin="true" name="waitConfig">
|
||||
<DESCRIPTION>wait configurations</DESCRIPTION>
|
||||
<PARAMETERS/>
|
||||
|
@ -222,6 +259,7 @@
|
|||
<ARC to="aggregatorGraph"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="aggregatorGraph" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>create the AGGREGATOR graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -230,7 +268,9 @@
|
|||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||
{
|
||||
'graphOutputPath' : 'aggregatorGraphPath',
|
||||
'isLookupUrl' : 'isLookUpUrl'
|
||||
'isLookupUrl' : 'isLookUpUrl',
|
||||
'reuseContent' : 'reuseContent',
|
||||
'contentPath' : 'contentPath'
|
||||
}
|
||||
</PARAM>
|
||||
<PARAM managedBy="system" name="params" required="true" type="string">
|
||||
|
@ -241,8 +281,6 @@
|
|||
'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus',
|
||||
'postgresUser' : 'dnet',
|
||||
'postgresPassword' : '',
|
||||
'reuseContent' : 'false',
|
||||
'contentPath' : '/tmp/beta_provision/aggregator',
|
||||
'workingDir' : '/tmp/beta_provision/working_dir/aggregator'
|
||||
}
|
||||
</PARAM>
|
||||
|
@ -252,6 +290,7 @@
|
|||
<ARC to="promoteActionsRaw"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="promoteActionsRaw" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>create the RAW graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -289,6 +328,7 @@
|
|||
<ARC to="duplicateScan"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="duplicateScan" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>search for duplicates in the raw graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -314,6 +354,7 @@
|
|||
<ARC to="promoteActionsIIS"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="promoteActionsIIS" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>create the INFERRED graph</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -351,6 +392,7 @@
|
|||
<ARC to="dedupConsistency"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="dedupConsistency" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>mark duplicates as deleted and redistribute the relationships</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -375,41 +417,6 @@
|
|||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="graphCleaning" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
|
||||
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
|
||||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||
{
|
||||
'graphInputPath' : 'consistentGraphPath',
|
||||
'graphOutputPath': 'cleanedGraphPath',
|
||||
'isLookupUrl': 'isLookUpUrl'
|
||||
}
|
||||
</PARAM>
|
||||
<PARAM managedBy="system" name="params" required="true" type="string">
|
||||
{
|
||||
'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app',
|
||||
'workingPath' : '/tmp/beta_provision/working_dir/clean'
|
||||
}
|
||||
</PARAM>
|
||||
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="SKIP_ENRICHMENT"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="SKIP_ENRICHMENT" type="Selection">
|
||||
<DESCRIPTION>Do we skip the graph enrichment steps? (Yes to prepare the graph for the IIS)</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM function="validValues(['YES', 'NO'])" managedBy="user" name="selection" required="true" type="string">NO</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC name="YES" to="success"/>
|
||||
<ARC name="NO" to="orcidPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
<NODE name="orcidPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>propagates ORCID among results linked by allowedsemrels semantic relationships</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -417,7 +424,7 @@
|
|||
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
|
||||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||
{
|
||||
'sourcePath' : 'cleanedGraphPath',
|
||||
'sourcePath' : 'consistentGraphPath',
|
||||
'outputPath': 'orcidGraphPath'
|
||||
}
|
||||
</PARAM>
|
||||
|
@ -435,6 +442,7 @@
|
|||
<ARC to="bulkTagging"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="bulkTagging" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>mark results respecting some rules as belonging to communities</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -460,6 +468,7 @@
|
|||
<ARC to="affiliationPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="affiliationPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>creates relashionships between results and organizations when the organizations are associated to institutional repositories</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -484,6 +493,7 @@
|
|||
<ARC to="communityOrganizationPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="communityOrganizationPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>marks as belonging to communities the result collected from datasources related to the organizations specified in the organizationCommunityMap</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -509,6 +519,7 @@
|
|||
<ARC to="resultProjectPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="resultProjectPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>created relation between projects and results linked to other results trough allowedsemrel semantic relations linked to projects</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -534,6 +545,7 @@
|
|||
<ARC to="communitySemrelPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="communitySemrelPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>tag as belonging to communitites result in in allowedsemrels relation with other result already linked to communities </DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -560,6 +572,7 @@
|
|||
<ARC to="countryPropagation"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="countryPropagation" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>associated to results colleced from allowedtypes and those in the whithelist the country of the organization(s) handling the datasource it is collected from </DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -584,10 +597,36 @@
|
|||
</PARAM>
|
||||
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="graphCleaning"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="graphCleaning" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
|
||||
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
|
||||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||
{
|
||||
'graphInputPath' : 'countryGraphPath',
|
||||
'graphOutputPath': 'cleanedGraphPath',
|
||||
'isLookupUrl': 'isLookUpUrl'
|
||||
}
|
||||
</PARAM>
|
||||
<PARAM managedBy="system" name="params" required="true" type="string">
|
||||
{
|
||||
'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app',
|
||||
'workingPath' : '/tmp/beta_provision/working_dir/clean'
|
||||
}
|
||||
</PARAM>
|
||||
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
||||
</PARAMETERS>
|
||||
<ARCS>
|
||||
<ARC to="blacklistRelations"/>
|
||||
</ARCS>
|
||||
</NODE>
|
||||
|
||||
<NODE name="blacklistRelations" type="SubmitHadoopJob">
|
||||
<DESCRIPTION>removes blacklisted relations </DESCRIPTION>
|
||||
<PARAMETERS>
|
||||
|
@ -595,7 +634,7 @@
|
|||
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
|
||||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||
{
|
||||
'sourcePath' : 'countryGraphPath',
|
||||
'sourcePath' : 'cleanedGraphPath',
|
||||
'outputPath': 'blacklistedGraphPath'
|
||||
}
|
||||
</PARAM>
|
||||
|
|
Loading…
Reference in New Issue