enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
4 changed files with 180 additions and 67 deletions
Showing only changes of commit 4eb3e109d7 - Show all commits

View File

@ -2,7 +2,6 @@
package eu.dnetlib.dhp.broker.model; package eu.dnetlib.dhp.broker.model;
import java.io.Serializable; import java.io.Serializable;
import java.util.Map;
public class Event implements Serializable { public class Event implements Serializable {
@ -25,7 +24,7 @@ public class Event implements Serializable {
private boolean instantMessage; private boolean instantMessage;
private Map<String, Object> map; private MappedFields map;
public Event() { public Event() {
} }
@ -33,7 +32,7 @@ public class Event implements Serializable {
public Event(final String producerId, final String eventId, final String topic, final String payload, public Event(final String producerId, final String eventId, final String topic, final String payload,
final Long creationDate, final Long expiryDate, final Long creationDate, final Long expiryDate,
final boolean instantMessage, final boolean instantMessage,
final Map<String, Object> map) { final MappedFields map) {
this.producerId = producerId; this.producerId = producerId;
this.eventId = eventId; this.eventId = eventId;
this.topic = topic; this.topic = topic;
@ -100,11 +99,11 @@ public class Event implements Serializable {
this.instantMessage = instantMessage; this.instantMessage = instantMessage;
} }
public Map<String, Object> getMap() { public MappedFields getMap() {
return this.map; return this.map;
} }
public void setMap(final Map<String, Object> map) { public void setMap(final MappedFields map) {
this.map = map; this.map = map;
} }
} }

View File

@ -3,9 +3,8 @@ package eu.dnetlib.dhp.broker.model;
import java.text.ParseException; import java.text.ParseException;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -30,7 +29,7 @@ public class EventFactory {
final Event res = new Event(); final Event res = new Event();
final Map<String, Object> map = createMapFromResult(updateInfo); final MappedFields map = createMapFromResult(updateInfo);
final String eventId = calculateEventId( final String eventId = calculateEventId(
updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString()); updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());
@ -46,35 +45,35 @@ public class EventFactory {
return res; return res;
} }
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) { private static MappedFields createMapFromResult(final UpdateInfo<?> updateInfo) {
final Map<String, Object> map = new HashMap<>(); final MappedFields map = new MappedFields();
final OaBrokerMainEntity source = updateInfo.getSource(); final OaBrokerMainEntity source = updateInfo.getSource();
final OaBrokerMainEntity target = updateInfo.getTarget(); final OaBrokerMainEntity target = updateInfo.getTarget();
map.put("target_datasource_id", target.getCollectedFromId()); map.setTargetDatasourceId(target.getCollectedFromId());
map.put("target_datasource_name", target.getCollectedFromName()); map.setTargetDatasourceName(target.getCollectedFromName());
map.put("target_publication_id", target.getOpenaireId()); map.setTargetResultId(target.getOpenaireId());
final List<String> titles = target.getTitles(); final List<String> titles = target.getTitles();
if (titles.size() > 0) { if (titles.size() > 0) {
map.put("target_publication_title", titles.get(0)); map.setTargetResultTitle(titles.get(0));
} }
final long date = parseDateTolong(target.getPublicationdate()); final long date = parseDateTolong(target.getPublicationdate());
if (date > 0) { if (date > 0) {
map.put("target_dateofacceptance", date); map.setTargetDateofacceptance(date);
} }
map.put("target_publication_subject_list", target.getSubjects()); map.setTargetSubjects(target.getSubjects().stream().map(s -> s.getValue()).collect(Collectors.toList()));
map.put("target_publication_author_list", target.getCreators()); map.setTargetAuthors(target.getCreators().stream().map(a -> a.getFullname()).collect(Collectors.toList()));
// PROVENANCE INFO // PROVENANCE INFO
map.put("trust", updateInfo.getTrust()); map.setTrust(updateInfo.getTrust());
map.put("provenance_datasource_id", source.getCollectedFromId()); map.setProvenanceDatasourceId(source.getCollectedFromId());
map.put("provenance_datasource_name", source.getCollectedFromName()); map.setProvenanceDatasourceName(source.getCollectedFromName());
map.put("provenance_publication_id_list", source.getOpenaireId()); map.setProvenanceResultId(source.getOpenaireId());
return map; return map;
} }

View File

@ -0,0 +1,137 @@
package eu.dnetlib.dhp.broker.model;
import java.io.Serializable;
import java.util.List;
import org.codehaus.jackson.annotate.JsonProperty;
/**
 * Serializable bean holding the summary fields of an event: the target result, its datasource,
 * and the provenance of the update. Each field is annotated with the snake_case name used for
 * it in the JSON representation.
 */
public class MappedFields implements Serializable {

    private static final long serialVersionUID = -7999704113195802008L;

    /** Identifier of the datasource of the target result. */
    @JsonProperty("target_datasource_id")
    private String targetDatasourceId;

    /** Name of the datasource of the target result. */
    @JsonProperty("target_datasource_name")
    private String targetDatasourceName;

    /** Identifier of the target result. */
    @JsonProperty("target_result_id")
    private String targetResultId;

    /** Title of the target result. */
    @JsonProperty("target_result_title")
    private String targetResultTitle;

    /** Date of acceptance of the target result, as a long; stays 0 when never set. */
    @JsonProperty("target_dateofacceptance")
    private long targetDateofacceptance;

    /** Subjects of the target result. */
    @JsonProperty("target_result_subject_list")
    private List<String> targetSubjects;

    /** Authors of the target result. */
    @JsonProperty("target_result_author_list")
    private List<String> targetAuthors;

    /** Trust value associated with the update. */
    @JsonProperty("trust")
    private float trust;

    /** Identifier of the datasource the update comes from. */
    @JsonProperty("provenance_datasource_id")
    private String provenanceDatasourceId;

    /** Name of the datasource the update comes from. */
    @JsonProperty("provenance_datasource_name")
    private String provenanceDatasourceName;

    /**
     * Identifier of the result the update comes from.
     * The JSON name follows the same snake_case convention as every other field
     * (it was previously the setter name, "setProvenanceResultId").
     */
    @JsonProperty("provenance_result_id")
    private String provenanceResultId;

    /** @return the identifier of the target datasource */
    public String getTargetDatasourceId() {
        return targetDatasourceId;
    }

    /** @param targetDatasourceId the identifier of the target datasource */
    public void setTargetDatasourceId(final String targetDatasourceId) {
        this.targetDatasourceId = targetDatasourceId;
    }

    /** @return the name of the target datasource */
    public String getTargetDatasourceName() {
        return targetDatasourceName;
    }

    /** @param targetDatasourceName the name of the target datasource */
    public void setTargetDatasourceName(final String targetDatasourceName) {
        this.targetDatasourceName = targetDatasourceName;
    }

    /** @return the identifier of the target result */
    public String getTargetResultId() {
        return targetResultId;
    }

    /** @param targetResultId the identifier of the target result */
    public void setTargetResultId(final String targetResultId) {
        this.targetResultId = targetResultId;
    }

    /** @return the title of the target result */
    public String getTargetResultTitle() {
        return targetResultTitle;
    }

    /** @param targetResultTitle the title of the target result */
    public void setTargetResultTitle(final String targetResultTitle) {
        this.targetResultTitle = targetResultTitle;
    }

    /** @return the date of acceptance of the target result (0 when never set) */
    public long getTargetDateofacceptance() {
        return targetDateofacceptance;
    }

    /** @param targetDateofacceptance the date of acceptance of the target result */
    public void setTargetDateofacceptance(final long targetDateofacceptance) {
        this.targetDateofacceptance = targetDateofacceptance;
    }

    /** @return the subjects of the target result */
    public List<String> getTargetSubjects() {
        return targetSubjects;
    }

    /** @param targetSubjects the subjects of the target result */
    public void setTargetSubjects(final List<String> targetSubjects) {
        this.targetSubjects = targetSubjects;
    }

    /** @return the authors of the target result */
    public List<String> getTargetAuthors() {
        return targetAuthors;
    }

    /** @param targetAuthors the authors of the target result */
    public void setTargetAuthors(final List<String> targetAuthors) {
        this.targetAuthors = targetAuthors;
    }

    /** @return the trust value of the update */
    public float getTrust() {
        return trust;
    }

    /** @param trust the trust value of the update */
    public void setTrust(final float trust) {
        this.trust = trust;
    }

    /** @return the identifier of the provenance datasource */
    public String getProvenanceDatasourceId() {
        return provenanceDatasourceId;
    }

    /** @param provenanceDatasourceId the identifier of the provenance datasource */
    public void setProvenanceDatasourceId(final String provenanceDatasourceId) {
        this.provenanceDatasourceId = provenanceDatasourceId;
    }

    /** @return the name of the provenance datasource */
    public String getProvenanceDatasourceName() {
        return provenanceDatasourceName;
    }

    /** @param provenanceDatasourceName the name of the provenance datasource */
    public void setProvenanceDatasourceName(final String provenanceDatasourceName) {
        this.provenanceDatasourceName = provenanceDatasourceName;
    }

    /** @return the identifier of the provenance result */
    public String getProvenanceResultId() {
        return provenanceResultId;
    }

    /** @param provenanceResultId the identifier of the provenance result */
    public void setProvenanceResultId(final String provenanceResultId) {
        this.provenanceResultId = provenanceResultId;
    }
}

View File

@ -7,29 +7,7 @@ import java.util.List;
import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
@ -38,31 +16,31 @@ public class EventFinder {
private static List<UpdateMatcher<?>> matchers = new ArrayList<>(); private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
static { static {
matchers.add(new EnrichMissingAbstract()); matchers.add(new EnrichMissingAbstract());
matchers.add(new EnrichMissingAuthorOrcid()); // matchers.add(new EnrichMissingAuthorOrcid());
matchers.add(new EnrichMissingOpenAccess()); // matchers.add(new EnrichMissingOpenAccess());
matchers.add(new EnrichMissingPid()); // matchers.add(new EnrichMissingPid());
matchers.add(new EnrichMissingPublicationDate()); // matchers.add(new EnrichMissingPublicationDate());
matchers.add(new EnrichMissingSubject()); // matchers.add(new EnrichMissingSubject());
matchers.add(new EnrichMoreOpenAccess()); // matchers.add(new EnrichMoreOpenAccess());
matchers.add(new EnrichMorePid()); // matchers.add(new EnrichMorePid());
matchers.add(new EnrichMoreSubject()); // matchers.add(new EnrichMoreSubject());
// Advanced matchers // // Advanced matchers
matchers.add(new EnrichMissingProject()); // matchers.add(new EnrichMissingProject());
matchers.add(new EnrichMoreProject()); // matchers.add(new EnrichMoreProject());
matchers.add(new EnrichMissingSoftware()); // matchers.add(new EnrichMissingSoftware());
matchers.add(new EnrichMoreSoftware()); // matchers.add(new EnrichMoreSoftware());
matchers.add(new EnrichMissingPublicationIsRelatedTo()); // matchers.add(new EnrichMissingPublicationIsRelatedTo());
matchers.add(new EnrichMissingPublicationIsReferencedBy()); // matchers.add(new EnrichMissingPublicationIsReferencedBy());
matchers.add(new EnrichMissingPublicationReferences()); // matchers.add(new EnrichMissingPublicationReferences());
matchers.add(new EnrichMissingPublicationIsSupplementedTo()); // matchers.add(new EnrichMissingPublicationIsSupplementedTo());
matchers.add(new EnrichMissingPublicationIsSupplementedBy()); // matchers.add(new EnrichMissingPublicationIsSupplementedBy());
matchers.add(new EnrichMissingDatasetIsRelatedTo()); // matchers.add(new EnrichMissingDatasetIsRelatedTo());
matchers.add(new EnrichMissingDatasetIsReferencedBy()); // matchers.add(new EnrichMissingDatasetIsReferencedBy());
matchers.add(new EnrichMissingDatasetReferences()); // matchers.add(new EnrichMissingDatasetReferences());
matchers.add(new EnrichMissingDatasetIsSupplementedTo()); // matchers.add(new EnrichMissingDatasetIsSupplementedTo());
matchers.add(new EnrichMissingDatasetIsSupplementedBy()); // matchers.add(new EnrichMissingDatasetIsSupplementedBy());
matchers.add(new EnrichMissingAbstract()); // matchers.add(new EnrichMissingAbstract());
} }
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) { public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {