enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
9 changed files with 192 additions and 47 deletions
Showing only changes of commit efadbdb2bc - Show all commits

View File

@ -0,0 +1,112 @@
package eu.dnetlib.dhp.broker.oa;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.TypedColumn;
import org.apache.spark.sql.expressions.Aggregator;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import scala.Tuple2;
public class CheckDuplictedIdsJob {
private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
CheckDuplictedIdsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
log.info("eventsPath: {}", eventsPath);
final String countPath = parser.get("workingPath") + "/counts";
log.info("countPath: {}", countPath);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id");
final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.groupByKey(t -> t._1, Encoders.STRING())
.agg(agg)
.map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.filter(t -> t._2 > 1)
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.write()
.mode(SaveMode.Overwrite)
.json(countPath);
;
}
private static String eventAsJsonString(final Event f) throws JsonProcessingException {
return new ObjectMapper().writeValueAsString(f);
}
}
class CountAggregator extends Aggregator<Tuple2<String, Long>, Tuple2<String, Long>, Tuple2<String, Long>> {
/**
*
*/
private static final long serialVersionUID = 1395935985734672538L;
@Override
public Encoder<Tuple2<String, Long>> bufferEncoder() {
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
}
@Override
public Tuple2<String, Long> finish(final Tuple2<String, Long> arg0) {
return arg0;
}
@Override
public Tuple2<String, Long> merge(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
return new Tuple2<>(s, arg0._2 + arg1._2);
}
@Override
public Encoder<Tuple2<String, Long>> outputEncoder() {
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
}
@Override
public Tuple2<String, Long> reduce(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
return new Tuple2<>(s, arg0._2 + arg1._2);
}
@Override
public Tuple2<String, Long> zero() {
return new Tuple2<>(null, 0l);
}
}

View File

@ -3,8 +3,6 @@ package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
@ -63,13 +61,13 @@ public class GenerateEventsJob {
final String eventsPath = workingPath + "/events";
log.info("eventsPath: {}", eventsPath);
final Set<String> dsIdWhitelist = parseParamAsList(parser, "datasourceIdWhitelist");
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ","));
final Set<String> dsTypeWhitelist = parseParamAsList(parser, "datasourceTypeWhitelist");
final Set<String> dsTypeWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceTypeWhitelist");
log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ","));
final Set<String> dsIdBlacklist = parseParamAsList(parser, "datasourceIdBlacklist");
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
final SparkConf conf = new SparkConf();
@ -103,22 +101,6 @@ public class GenerateEventsJob {
}
private static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
final String s = parser.get(key).trim();
final Set<String> res = new HashSet<>();
if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
Arrays
.stream(s.split(","))
.map(String::trim)
.filter(StringUtils::isNotBlank)
.forEach(res::add);
}
return res;
}
public static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {
return EventFinder

View File

@ -48,7 +48,7 @@ public class IndexOnESJob {
final JavaRDD<String> inputRdd = ClusterUtils
.readPath(spark, eventsPath, Event.class)
.limit(10000) // TODO REMOVE
// .limit(10000) // TODO REMOVE
.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
.javaRDD();

View File

@ -25,7 +25,9 @@ public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { return new ArrayList<>(); }
if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingProjects = target
.getProjects()

View File

@ -1,6 +1,11 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@ -10,6 +15,7 @@ import org.apache.spark.util.LongAccumulator;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
public class ClusterUtils {
@ -53,7 +59,9 @@ public class ClusterUtils {
return o;
}
public static <T> void save(final Dataset<T> dataset, final String path, final Class<T> clazz,
public static <T> void save(final Dataset<T> dataset,
final String path,
final Class<T> clazz,
final LongAccumulator acc) {
dataset
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
@ -62,4 +70,20 @@ public class ClusterUtils {
.json(path);
}
public static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
final String s = parser.get(key).trim();
final Set<String> res = new HashSet<>();
if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
Arrays
.stream(s.split(","))
.map(String::trim)
.filter(StringUtils::isNotBlank)
.forEach(res::add);
}
return res;
}
}

View File

@ -44,7 +44,9 @@ public class ConversionUtils {
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
public static List<OaBrokerInstance> oafInstanceToBrokerInstances(final Instance i) {
if (i == null) { return new ArrayList<>(); }
if (i == null) {
return new ArrayList<>();
}
return mappedList(i.getUrl(), url -> {
final OaBrokerInstance res = new OaBrokerInstance();
@ -65,7 +67,9 @@ public class ConversionUtils {
}
public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
if (d == null) { return null; }
if (d == null) {
return null;
}
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
res.setOpenaireId(d.getId());
@ -78,7 +82,9 @@ public class ConversionUtils {
}
public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) {
if (p == null) { return null; }
if (p == null) {
return null;
}
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
res.setOpenaireId(p.getId());
@ -92,7 +98,9 @@ public class ConversionUtils {
}
public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) {
if (result == null) { return null; }
if (result == null) {
return null;
}
final OaBrokerMainEntity res = new OaBrokerMainEntity();
@ -109,7 +117,8 @@ public class ConversionUtils {
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
res.setContributor(fieldList(result.getContributor()));
res
.setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
.setJournal(
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -121,7 +130,9 @@ public class ConversionUtils {
}
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
if (author == null) { return null; }
if (author == null) {
return null;
}
final String pids = author.getPid() != null ? author
.getPid()
@ -145,7 +156,9 @@ public class ConversionUtils {
}
private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
if (journal == null) { return null; }
if (journal == null) {
return null;
}
final OaBrokerJournal res = new OaBrokerJournal();
res.setName(journal.getName());
@ -157,7 +170,9 @@ public class ConversionUtils {
}
private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
if (ref == null) { return null; }
if (ref == null) {
return null;
}
final OaBrokerExternalReference res = new OaBrokerExternalReference();
res.setRefidentifier(ref.getRefidentifier());
@ -168,7 +183,9 @@ public class ConversionUtils {
}
public static final OaBrokerProject oafProjectToBrokerProject(final Project p) {
if (p == null) { return null; }
if (p == null) {
return null;
}
final OaBrokerProject res = new OaBrokerProject();
res.setOpenaireId(p.getId());
@ -192,7 +209,9 @@ public class ConversionUtils {
}
public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) {
if (sw == null) { return null; }
if (sw == null) {
return null;
}
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
res.setOpenaireId(sw.getId());
@ -255,7 +274,9 @@ public class ConversionUtils {
}
private static List<OaBrokerTypedValue> structPropTypedList(final List<StructuredProperty> list) {
if (list == null) { return new ArrayList<>(); }
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
@ -265,7 +286,9 @@ public class ConversionUtils {
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) { return new ArrayList<>(); }
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
@ -276,7 +299,9 @@ public class ConversionUtils {
}
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
if (list == null) { return new ArrayList<>(); }
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
@ -288,7 +313,9 @@ public class ConversionUtils {
}
private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
if (list == null) { return null; }
if (list == null) {
return null;
}
return list
.stream()

View File

@ -70,7 +70,6 @@ public class EventFinder {
matchers.add(new EnrichMissingDatasetReferences());
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
matchers.add(new EnrichMissingAbstract());
}
public static EventGroup generateEvents(final ResultGroup results,

View File

@ -447,7 +447,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="2"
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -73,33 +73,32 @@
</configuration>
</global>
<start to="index_es"/>
<start to="count"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="index_es">
<action name="count">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>IndexOnESJob</name>
<class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
<name>Count</name>
<class>eu.dnetlib.dhp.broker.oa.CheckDuplictedIdsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="2"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--index</arg><arg>${esIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>