forked from D-Net/dnet-hadoop

fixed a bug with duplicated events

parent edf6c6c4dc
commit efadbdb2bc
CheckDuplictedIdsJob.java (new file)
@@ -0,0 +1,112 @@

package eu.dnetlib.dhp.broker.oa;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.TypedColumn;
import org.apache.spark.sql.expressions.Aggregator;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import scala.Tuple2;

public class CheckDuplictedIdsJob {

    private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class);

    public static void main(final String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    CheckDuplictedIdsJob.class
                        .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
        parser.parseArgument(args);

        final SparkConf conf = new SparkConf();

        final String eventsPath = parser.get("workingPath") + "/events";
        log.info("eventsPath: {}", eventsPath);

        final String countPath = parser.get("workingPath") + "/counts";
        log.info("countPath: {}", countPath);

        final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

        final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id");

        final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();

        ClusterUtils
            .readPath(spark, eventsPath, Event.class)
            .map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
            .groupByKey(t -> t._1, Encoders.STRING())
            .agg(agg)
            .map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
            .filter(t -> t._2 > 1)
            .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
            .write()
            .mode(SaveMode.Overwrite)
            .json(countPath);

    }

    private static String eventAsJsonString(final Event f) throws JsonProcessingException {
        return new ObjectMapper().writeValueAsString(f);
    }

}

class CountAggregator extends Aggregator<Tuple2<String, Long>, Tuple2<String, Long>, Tuple2<String, Long>> {

    private static final long serialVersionUID = 1395935985734672538L;

    @Override
    public Encoder<Tuple2<String, Long>> bufferEncoder() {
        return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
    }

    @Override
    public Tuple2<String, Long> finish(final Tuple2<String, Long> arg0) {
        return arg0;
    }

    @Override
    public Tuple2<String, Long> merge(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
        final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
        return new Tuple2<>(s, arg0._2 + arg1._2);
    }

    @Override
    public Encoder<Tuple2<String, Long>> outputEncoder() {
        return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
    }

    @Override
    public Tuple2<String, Long> reduce(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
        final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
        return new Tuple2<>(s, arg0._2 + arg1._2);
    }

    @Override
    public Tuple2<String, Long> zero() {
        return new Tuple2<>(null, 0l);
    }

}
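Note for reviewers: the typed Aggregator pipeline above boils down to a group-by on eventId with a per-key count, keeping only ids seen more than once. The following is a minimal plain-Java sketch of that same logic, useful for unit-testing the expected behaviour without a Spark cluster; the findDuplicatedIds helper and the sample ids are illustrative only and are not part of this commit.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class DuplicatedIdsSketch {

    // Hypothetical helper: counts occurrences per event id and keeps only ids seen more than once,
    // mirroring groupByKey + CountAggregator + filter(t -> t._2 > 1) in CheckDuplictedIdsJob.
    static Map<String, Long> findDuplicatedIds(final List<String> eventIds) {
        return eventIds
            .stream()
            .collect(Collectors.groupingBy(id -> id, Collectors.counting()))
            .entrySet()
            .stream()
            .filter(e -> e.getValue() > 1)
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    public static void main(final String[] args) {
        // Sample ids are made up for illustration.
        final List<String> ids = Arrays.asList("event-1", "event-2", "event-2", "event-3", "event-2");
        System.out.println(findDuplicatedIds(ids)); // prints {event-2=3}
    }
}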
GenerateEventsJob.java
@@ -3,8 +3,6 @@ package eu.dnetlib.dhp.broker.oa;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

-import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;

@@ -63,13 +61,13 @@ public class GenerateEventsJob {
         final String eventsPath = workingPath + "/events";
         log.info("eventsPath: {}", eventsPath);

-        final Set<String> dsIdWhitelist = parseParamAsList(parser, "datasourceIdWhitelist");
+        final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
         log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ","));

-        final Set<String> dsTypeWhitelist = parseParamAsList(parser, "datasourceTypeWhitelist");
+        final Set<String> dsTypeWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceTypeWhitelist");
         log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ","));

-        final Set<String> dsIdBlacklist = parseParamAsList(parser, "datasourceIdBlacklist");
+        final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
         log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));

         final SparkConf conf = new SparkConf();

@@ -103,22 +101,6 @@ public class GenerateEventsJob {

     }

-    private static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
-        final String s = parser.get(key).trim();
-
-        final Set<String> res = new HashSet<>();
-
-        if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
-            Arrays
-                .stream(s.split(","))
-                .map(String::trim)
-                .filter(StringUtils::isNotBlank)
-                .forEach(res::add);
-        }
-
-        return res;
-    }
-
     public static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {

         return EventFinder
IndexOnESJob.java
@@ -48,7 +48,7 @@ public class IndexOnESJob {

         final JavaRDD<String> inputRdd = ClusterUtils
             .readPath(spark, eventsPath, Event.class)
-            .limit(10000) // TODO REMOVE
+            // .limit(10000) // TODO REMOVE
             .map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
             .javaRDD();

EnrichMoreProject.java
@@ -25,7 +25,9 @@ public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
     protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
         final OaBrokerMainEntity target) {

-        if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { return new ArrayList<>(); }
+        if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
+            return new ArrayList<>();
+        }

         final Set<String> existingProjects = target
             .getProjects()
ClusterUtils.java
@@ -1,6 +1,11 @@

 package eu.dnetlib.dhp.broker.oa.util;

+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
 import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;

@@ -10,6 +15,7 @@ import org.apache.spark.util.LongAccumulator;

 import com.fasterxml.jackson.databind.ObjectMapper;

+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;

 public class ClusterUtils {

@@ -53,7 +59,9 @@ public class ClusterUtils {
         return o;
     }

-    public static <T> void save(final Dataset<T> dataset, final String path, final Class<T> clazz,
+    public static <T> void save(final Dataset<T> dataset,
+        final String path,
+        final Class<T> clazz,
         final LongAccumulator acc) {
         dataset
             .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))

@@ -62,4 +70,20 @@
             .json(path);
     }

+    public static Set<String> parseParamAsList(final ArgumentApplicationParser parser, final String key) {
+        final String s = parser.get(key).trim();
+
+        final Set<String> res = new HashSet<>();
+
+        if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list
+            Arrays
+                .stream(s.split(","))
+                .map(String::trim)
+                .filter(StringUtils::isNotBlank)
+                .forEach(res::add);
+        }
+
+        return res;
+    }
+
 }
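Note for reviewers: the single-character convention in parseParamAsList (a value such as "-" means "no filter", i.e. an empty list) is easy to miss. The snippet below is an illustrative standalone rewrite of just the string-splitting part; the parseAsList helper is hypothetical, bypasses ArgumentApplicationParser and avoids the commons-lang3 dependency, and is not part of this commit.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ParseParamAsListSketch {

    // Hypothetical helper mirroring ClusterUtils.parseParamAsList, but taking the raw
    // parameter value instead of an ArgumentApplicationParser.
    static Set<String> parseAsList(final String value) {
        final String s = value.trim();
        final Set<String> res = new HashSet<>();
        if (s.length() > 1) { // a single-char value (e.g. "-") indicates an empty list
            Arrays
                .stream(s.split(","))
                .map(String::trim)
                .filter(v -> !v.isEmpty())
                .forEach(res::add);
        }
        return res;
    }

    public static void main(final String[] args) {
        System.out.println(parseAsList("-")); // [] -> no whitelist/blacklist applied
        System.out.println(parseAsList("datasourceA, datasourceB")); // [datasourceA, datasourceB] (sample values)
    }
}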
ConversionUtils.java
@@ -44,7 +44,9 @@ public class ConversionUtils {
     private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);

     public static List<OaBrokerInstance> oafInstanceToBrokerInstances(final Instance i) {
-        if (i == null) { return new ArrayList<>(); }
+        if (i == null) {
+            return new ArrayList<>();
+        }

         return mappedList(i.getUrl(), url -> {
             final OaBrokerInstance res = new OaBrokerInstance();

@@ -65,7 +67,9 @@
     }

     public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
-        if (d == null) { return null; }
+        if (d == null) {
+            return null;
+        }

         final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
         res.setOpenaireId(d.getId());

@@ -78,7 +82,9 @@
     }

     public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) {
-        if (p == null) { return null; }
+        if (p == null) {
+            return null;
+        }

         final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
         res.setOpenaireId(p.getId());

@@ -92,7 +98,9 @@
     }

     public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) {
-        if (result == null) { return null; }
+        if (result == null) {
+            return null;
+        }

         final OaBrokerMainEntity res = new OaBrokerMainEntity();

@@ -109,7 +117,8 @@
         res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
         res.setContributor(fieldList(result.getContributor()));
         res
-            .setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
+            .setJournal(
+                result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
         res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
         res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
         res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));

@@ -121,7 +130,9 @@
     }

     private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
-        if (author == null) { return null; }
+        if (author == null) {
+            return null;
+        }

         final String pids = author.getPid() != null ? author
             .getPid()

@@ -145,7 +156,9 @@
     }

     private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
-        if (journal == null) { return null; }
+        if (journal == null) {
+            return null;
+        }

         final OaBrokerJournal res = new OaBrokerJournal();
         res.setName(journal.getName());

@@ -157,7 +170,9 @@
     }

     private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
-        if (ref == null) { return null; }
+        if (ref == null) {
+            return null;
+        }

         final OaBrokerExternalReference res = new OaBrokerExternalReference();
         res.setRefidentifier(ref.getRefidentifier());

@@ -168,7 +183,9 @@
     }

     public static final OaBrokerProject oafProjectToBrokerProject(final Project p) {
-        if (p == null) { return null; }
+        if (p == null) {
+            return null;
+        }

         final OaBrokerProject res = new OaBrokerProject();
         res.setOpenaireId(p.getId());

@@ -192,7 +209,9 @@
     }

     public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) {
-        if (sw == null) { return null; }
+        if (sw == null) {
+            return null;
+        }

         final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
         res.setOpenaireId(sw.getId());

@@ -255,7 +274,9 @@
     }

     private static List<OaBrokerTypedValue> structPropTypedList(final List<StructuredProperty> list) {
-        if (list == null) { return new ArrayList<>(); }
+        if (list == null) {
+            return new ArrayList<>();
+        }

         return list
             .stream()

@@ -265,7 +286,9 @@
     }

     private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
-        if (list == null) { return new ArrayList<>(); }
+        if (list == null) {
+            return new ArrayList<>();
+        }

         return list
             .stream()

@@ -276,7 +299,9 @@
     }

     private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
-        if (list == null) { return new ArrayList<>(); }
+        if (list == null) {
+            return new ArrayList<>();
+        }

         return list
             .stream()

@@ -288,7 +313,9 @@
     }

     private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
-        if (list == null) { return null; }
+        if (list == null) {
+            return null;
+        }

         return list
             .stream()
EventFinder.java
@@ -70,7 +70,6 @@ public class EventFinder {
         matchers.add(new EnrichMissingDatasetReferences());
         matchers.add(new EnrichMissingDatasetIsSupplementedTo());
         matchers.add(new EnrichMissingDatasetIsSupplementedBy());
-        matchers.add(new EnrichMissingAbstract());
     }

     public static EventGroup generateEvents(final ResultGroup results,
workflow.xml (Oozie)
@@ -447,7 +447,7 @@
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.dynamicAllocation.maxExecutors="2"
+                --conf spark.dynamicAllocation.maxExecutors="8"
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
workflow.xml (Oozie)
@@ -73,33 +73,32 @@
         </configuration>
     </global>

-    <start to="index_es"/>
+    <start to="count"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

-    <action name="index_es">
+    <action name="count">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>IndexOnESJob</name>
-            <class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
+            <name>Count</name>
+            <class>eu.dnetlib.dhp.broker.oa.CheckDuplictedIdsJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="2"
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--index</arg><arg>${esIndexName}</arg>
-            <arg>--esHost</arg><arg>${esIndexHost}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>