// dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java
// 128 lines, 5.4 KiB, Java
// (header apparently captured from a repository web view: Raw / Normal View / History)

2020-06-11 11:25:18 +02:00
package eu.dnetlib.dhp.broker.oa.util;
import java.util.ArrayList;
import java.util.List;
2020-06-29 16:33:32 +02:00
import java.util.Map;
2020-07-02 12:43:03 +02:00
import java.util.Set;
2020-06-29 16:33:32 +02:00
import org.apache.spark.util.LongAccumulator;
2020-07-02 12:43:03 +02:00
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2020-06-11 11:25:18 +02:00
2020-06-22 08:51:31 +02:00
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
2020-07-15 09:18:40 +02:00
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
2020-06-11 11:25:18 +02:00
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
2020-06-29 08:43:56 +02:00
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
2020-06-26 11:20:45 +02:00
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
2020-06-29 08:43:56 +02:00
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
2020-06-11 11:25:18 +02:00
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
2020-06-26 11:20:45 +02:00
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
2020-06-11 11:25:18 +02:00
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
public class EventFinder {
2020-07-02 12:43:03 +02:00
private static final Logger log = LoggerFactory.getLogger(EventFinder.class);
2020-06-29 16:33:32 +02:00
private static final List<UpdateMatcher<?>> matchers = new ArrayList<>();
2020-06-11 11:25:18 +02:00
static {
matchers.add(new EnrichMissingAbstract());
2020-06-26 11:20:45 +02:00
matchers.add(new EnrichMissingAuthorOrcid());
matchers.add(new EnrichMissingOpenAccess());
matchers.add(new EnrichMissingPid());
matchers.add(new EnrichMissingPublicationDate());
matchers.add(new EnrichMissingSubject());
matchers.add(new EnrichMoreOpenAccess());
matchers.add(new EnrichMorePid());
matchers.add(new EnrichMoreSubject());
2020-06-11 11:25:18 +02:00
2020-06-29 16:33:32 +02:00
// Advanced matchers
2020-06-26 11:20:45 +02:00
matchers.add(new EnrichMissingProject());
2020-06-29 08:43:56 +02:00
matchers.add(new EnrichMoreProject());
matchers.add(new EnrichMissingSoftware());
matchers.add(new EnrichMoreSoftware());
matchers.add(new EnrichMissingPublicationIsRelatedTo());
matchers.add(new EnrichMissingPublicationIsReferencedBy());
matchers.add(new EnrichMissingPublicationReferences());
matchers.add(new EnrichMissingPublicationIsSupplementedTo());
matchers.add(new EnrichMissingPublicationIsSupplementedBy());
matchers.add(new EnrichMissingDatasetIsRelatedTo());
matchers.add(new EnrichMissingDatasetIsReferencedBy());
matchers.add(new EnrichMissingDatasetReferences());
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
2020-06-11 11:25:18 +02:00
}
2020-06-29 16:33:32 +02:00
public static EventGroup generateEvents(final ResultGroup results,
2020-07-02 12:43:03 +02:00
final Set<String> dsIdWhitelist,
final Set<String> dsIdBlacklist,
final Set<String> dsTypeWhitelist,
2020-12-14 11:03:55 +01:00
final Set<String> topicWhitelist,
2020-06-29 16:33:32 +02:00
final Map<String, LongAccumulator> accumulators) {
2020-07-02 12:43:03 +02:00
2020-06-11 11:25:18 +02:00
final List<UpdateInfo<?>> list = new ArrayList<>();
2020-06-22 08:51:31 +02:00
for (final OaBrokerMainEntity target : results.getData()) {
2020-07-15 09:18:40 +02:00
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
for (final UpdateMatcher<?> matcher : matchers) {
2020-12-14 11:03:55 +01:00
for (final UpdateInfo<?> info : matcher
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
if (topicWhitelist == null || topicWhitelist.isEmpty()
|| topicWhitelist.contains(info.getTopic().getPath())) {
list.add(info);
}
}
2020-07-15 09:18:40 +02:00
}
2020-07-02 12:43:03 +02:00
}
2020-06-11 11:25:18 +02:00
}
}
return asEventGroup(list);
}
2020-07-15 09:18:40 +02:00
private static boolean verifyTarget(final OaBrokerRelatedDatasource target,
2020-07-02 12:43:03 +02:00
final Set<String> dsIdWhitelist,
final Set<String> dsIdBlacklist,
final Set<String> dsTypeWhitelist) {
2020-07-15 09:18:40 +02:00
if (dsIdWhitelist.contains(target.getOpenaireId())) {
2020-07-02 12:43:03 +02:00
return true;
2020-07-15 09:18:40 +02:00
} else if (dsIdBlacklist.contains(target.getOpenaireId())) {
2020-07-02 12:43:03 +02:00
return false;
} else {
2020-07-15 09:18:40 +02:00
return dsTypeWhitelist.contains(target.getType());
2020-07-02 12:43:03 +02:00
}
}
2020-06-11 11:25:18 +02:00
private static EventGroup asEventGroup(final List<UpdateInfo<?>> list) {
final EventGroup events = new EventGroup();
list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement);
return events;
}
2020-06-29 16:33:32 +02:00
public static List<UpdateMatcher<?>> getMatchers() {
return matchers;
}
2020-06-11 11:25:18 +02:00
}